import logging
from datetime import datetime
from pathlib import Path

import pandas as pd
import pdfplumber

from pdf_oralia.pages import charge, locataire, patrimoine

# Settings forwarded to ``page.extract_table`` for every page.
extract_table_settings = {
    "vertical_strategy": "lines",
    "horizontal_strategy": "text",
}

# Page-type modules, in the order in which a page is tested against them.
# Each one is used below through ``is_it(page_text)``, ``fsm()`` and ``Line``.
_PAGE_MODULES = (locataire, charge, patrimoine)


def extract_date(page_text):
    """Extract date from a page

    The first line containing "Lyon le" is used; its last space-separated
    word is parsed with the ``%d/%m/%Y`` format.

    :param page_text: text in the page
    :return: the extracted date, or None when no line contains "Lyon le"
    :raises ValueError: if the last word of the matching line is not a
        ``%d/%m/%Y`` date (raised by ``datetime.strptime``)
    """
    for bloc in page_text.split("\n"):
        if "Lyon le" in bloc:
            words = bloc.split(" ")
            return datetime.strptime(words[-1], "%d/%m/%Y")
    return None


def extract_building(page_text, buildings=("bloch", "marietton", "servient")):
    """Return the first known building name found in a page

    The search is case-insensitive on the page side: ``page_text`` is
    lowercased, the names in ``buildings`` are used as given.

    :param page_text: text in the page
    :param buildings: candidate building names, tested in order
    :return: the first name of ``buildings`` contained in the page text
    :raises ValueError: if none of the names is found
    """
    lowered_text = page_text.lower()
    for building in buildings:
        if building in lowered_text:
            return building
    raise ValueError("Pas d'immeuble trouvé")


def pdf_extract_tables_lines(pdf):
    """Yield the parsed table lines of every recognised page of a pdf

    Each page is matched against the locataire, charge and patrimoine page
    modules (in that order). The rows of the page table are sent one by one
    to the matching module's ``fsm()`` generator; every truthy result is
    completed with the building, month and year of the page and yielded as
    that module's ``Line``.

    Pages matching no module are logged and skipped before anything else is
    extracted from them.

    :param pdf: opened pdf exposing a ``pages`` iterable (as returned by
        ``pdfplumber.open``)
    :return: generator of ``locataire.Line``, ``charge.Line`` and
        ``patrimoine.Line`` objects
    :raises ValueError: if a recognised page has no known building or no
        "Lyon le" date
    """
    # One sink per page type, shared across pages. Each generator is
    # advanced once with next() before rows are sent to it.
    sinks = {}
    for module in _PAGE_MODULES:
        sink = module.fsm()
        next(sink)
        sinks[module] = sink

    for page_number, page in enumerate(pdf.pages, start=1):
        # Guard against a None text so the checks below never crash on it.
        page_text = page.extract_text() or ""

        module = next((m for m in _PAGE_MODULES if m.is_it(page_text)), None)
        if module is None:
            logging.warning(f"Page {page_number} non reconnu. Page ignorée.")
            continue

        date = extract_date(page_text)
        building = extract_building(page_text)
        if date is None:
            # Fail with an explicit message instead of an AttributeError on
            # ``None.strftime``.
            raise ValueError(f"Pas de date trouvée page {page_number}")
        additionnal_fields = {
            "immeuble": building,
            "mois": date.strftime("%m"),
            "annee": date.strftime("%Y"),
        }

        # extract_table returns None when the page has no table.
        table = page.extract_table(extract_table_settings) or []
        sink = sinks[module]
        for line in table:
            res = sink.send(line)
            if res:
                res.update(additionnal_fields)
                yield module.Line(**res)


def from_pdf(pdf_file):
    """Build three dataframes: charges, locataires and patrimoine

    :param pdf_file: path (or file object) accepted by ``pdfplumber.open``
    :return: tuple ``(charges, locataires, patrimoine)`` of dataframes, one
        row per extracted line
    """
    locataire_lines = []
    charge_lines = []
    patrimoine_lines = []

    # The generator is lazy: it has to be fully consumed while the pdf is
    # still open, hence the loop inside the ``with`` block.
    with pdfplumber.open(pdf_file) as pdf:
        for line in pdf_extract_tables_lines(pdf):
            if isinstance(line, locataire.Line):
                locataire_lines.append(line)
            elif isinstance(line, charge.Line):
                charge_lines.append(line)
            elif isinstance(line, patrimoine.Line):
                patrimoine_lines.append(line)
            else:
                # No page number is available here; report the type instead.
                logging.warning(
                    f"Ligne de type {type(line).__name__} non reconnue. "
                    "Ligne ignorée."
                )

    return (
        pd.DataFrame([c.__dict__ for c in charge_lines]),
        pd.DataFrame([c.__dict__ for c in locataire_lines]),
        pd.DataFrame([c.__dict__ for c in patrimoine_lines]),
    )


def extract_save(pdf_file, dest):
    """Extract charge, locataire and patrimoine tables and save them as xlsx

    Three files are written in ``dest``, named after the pdf file stem (with
    spaces replaced by underscores) followed by ``_charge.xlsx``,
    ``_locataire.xlsx`` and ``_patrimoine.xlsx``.

    :param pdf_file: path of the pdf file to extract
    :param dest: directory receiving the xlsx files
    """
    pdf_file = Path(pdf_file)
    dest = Path(dest)
    stem = pdf_file.stem.replace(" ", "_")

    df_charge, df_loc, df_patrimoine = from_pdf(pdf_file)

    exports = (
        (df_charge, "charge", "Charges"),
        (df_loc, "locataire", "Location"),
        (df_patrimoine, "patrimoine", "Patrimoine"),
    )
    for df, suffix, sheet_name in exports:
        xls_path = dest / f"{stem}_{suffix}.xlsx"
        df.to_excel(xls_path, sheet_name=sheet_name, index=False)
        logging.info(f"{xls_path} saved")