122 lines
3.9 KiB
Python
122 lines
3.9 KiB
Python
import logging
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
import pandas as pd
|
|
|
|
import pdfplumber
|
|
|
|
from pdf_oralia.pages import charge, locataire, patrimoine
|
|
|
|
# Table-detection settings handed to ``page.extract_table`` for every page.
# Per the pdfplumber documentation, "lines" derives column borders from the
# graphical lines drawn on the page, while "text" derives row borders from
# the position of the words.
extract_table_settings = dict(
    vertical_strategy="lines",
    horizontal_strategy="text",
)
|
|
|
|
|
|
def extract_date(page_text):
    """Extract the date from a page.

    The page is scanned line by line; on the first line containing
    "Lyon le", the last whitespace-separated token is parsed as a
    ``%d/%m/%Y`` date.

    :param page_text: text in the page
    :return: the extracted date as a ``datetime``, or ``None`` when no line
        contains "Lyon le"
    :raises ValueError: if the last token of the matching line is not a
        ``%d/%m/%Y`` date
    """
    for line in page_text.split("\n"):
        if "Lyon le" in line:
            # split() with no argument collapses runs of whitespace and drops
            # trailing blanks / "\r", so the last token really is the date
            # even when the extracted line ends with stray whitespace.
            words = line.split()
            return datetime.strptime(words[-1], "%d/%m/%Y")
    # No dated line on this page: callers must handle the missing date.
    return None
|
|
|
|
|
|
def extract_building(page_text, buildings=("bloch", "marietton", "servient")):
    """Return the first known building name mentioned in a page.

    The search is a case-insensitive substring match: the page text is
    lower-cased and each candidate is looked up as-is, in the order given.

    :param page_text: text in the page
    :param buildings: iterable of candidate names, expected in lower case
        (an immutable tuple by default so the default cannot be mutated
        between calls)
    :return: the first candidate found in the page text
    :raises ValueError: if none of the candidates appears in the page
    """
    # Lower-case once instead of once per candidate.
    lowered = page_text.lower()
    for building in buildings:
        if building in lowered:
            return building
    raise ValueError("Pas d'immeuble trouvé")
|
|
|
|
|
|
def pdf_extract_tables_lines(pdf):
    """Yield the parsed table lines of every recognised page of a PDF.

    Each page is classified with ``locataire.is_it``, ``charge.is_it`` and
    ``patrimoine.is_it`` (tried in that order). The rows of the page's table
    are then sent one by one to the matching module's ``fsm()`` generator;
    every truthy result is completed with the page's building, month and
    year and yielded as that module's ``Line``.

    Pages matching none of the three types are logged and skipped.

    :param pdf: an opened pdfplumber document (anything exposing ``pages``)
    :return: generator of ``locataire.Line``, ``charge.Line`` and
        ``patrimoine.Line`` objects, in page order
    :raises ValueError: if a recognised page has no date or no known building
    """
    # One primed state machine per page type; they are shared across pages,
    # so a machine keeps its state from one page of its type to the next.
    handlers = []
    for module in (locataire, charge, patrimoine):
        sink = module.fsm()
        next(sink)  # advance to the first ``yield`` so ``send`` is accepted
        handlers.append((module, sink))

    for page_number, page in enumerate(pdf.pages, start=1):
        # NOTE(review): some pdfplumber versions appear to return None for a
        # page without text; normalise so the checks below get a string.
        page_text = page.extract_text() or ""

        handler = next(
            ((module, sink) for module, sink in handlers if module.is_it(page_text)),
            None,
        )
        if handler is None:
            logging.warning(f"Page {page_number} non reconnu. Page ignorée.")
            # Really ignore the page: do not require a date/building on it
            # and do not spend time extracting a table nobody consumes.
            continue
        module, sink = handler

        date = extract_date(page_text)
        if date is None:
            # Fail with an explicit message instead of an AttributeError on
            # ``None.strftime``.
            raise ValueError(f"Pas de date trouvée (page {page_number})")
        additionnal_fields = {
            "immeuble": extract_building(page_text),
            "mois": date.strftime("%m"),
            "annee": date.strftime("%Y"),
        }

        # extract_table returns None when no table is found on the page.
        for line in page.extract_table(extract_table_settings) or []:
            res = sink.send(line)
            if res:
                res.update(additionnal_fields)
                yield module.Line(**res)
|
|
|
|
|
|
def from_pdf(pdf_file):
    """Build the charge, locataire and patrimoine dataframes of a PDF.

    The PDF is opened with pdfplumber, every line produced by
    ``pdf_extract_tables_lines`` is sorted by its ``Line`` class, and one
    dataframe is built per class from the lines' attributes.

    :param pdf_file: path (or file object) accepted by ``pdfplumber.open``
    :return: tuple ``(charges, locataires, patrimoine)`` of
        ``pandas.DataFrame``; a dataframe is empty when no line of that
        kind was found
    """
    locataire_lines = []
    charge_lines = []
    patrimoine_lines = []

    # The context manager closes the PDF even if extraction fails. The
    # generator is lazy, so it must be fully consumed inside the block.
    with pdfplumber.open(pdf_file) as pdf:
        for line in pdf_extract_tables_lines(pdf):
            if isinstance(line, locataire.Line):
                locataire_lines.append(line)
            elif isinstance(line, charge.Line):
                charge_lines.append(line)
            elif isinstance(line, patrimoine.Line):
                patrimoine_lines.append(line)
            else:
                # No page number is available at this level (the original
                # message referenced an undefined ``page_number``), so
                # report the unexpected line type instead.
                logging.warning(
                    f"Ligne de type {type(line).__name__} non reconnue. Ligne ignorée."
                )

    return (
        pd.DataFrame([vars(c) for c in charge_lines]),
        pd.DataFrame([vars(c) for c in locataire_lines]),
        pd.DataFrame([vars(c) for c in patrimoine_lines]),
    )
|
|
|
|
|
|
def extract_save(pdf_file, dest):
    """Extract the tables of ``pdf_file`` and write them as xlsx files in ``dest``.

    Three workbooks are written, named after the PDF stem (spaces replaced
    by underscores) with the suffixes ``_charge``, ``_locataire`` and
    ``_patrimoine``.

    :param pdf_file: path of the PDF to read
    :param dest: directory receiving the xlsx files
    """
    pdf_file = Path(pdf_file)
    out_dir = Path(dest)
    base_name = pdf_file.stem.replace(" ", "_")

    xls_charge = out_dir / f"{base_name}_charge.xlsx"
    xls_locataire = out_dir / f"{base_name}_locataire.xlsx"
    xls_patrimoine = out_dir / f"{base_name}_patrimoine.xlsx"

    df_charge, df_loc, df_patrimoine = from_pdf(pdf_file)

    # Written in the same order as the dataframes are returned.
    exports = (
        (df_charge, xls_charge, "Charges"),
        (df_loc, xls_locataire, "Location"),
        (df_patrimoine, xls_patrimoine, "Patrimoine"),
    )
    for frame, target, sheet in exports:
        frame.to_excel(target, sheet_name=sheet, index=False)
        logging.info(f"{target} saved")
|