diff --git a/pdf_oralia/extract.py b/pdf_oralia/extract.py
index 3ad05ac..d1a7599 100644
--- a/pdf_oralia/extract.py
+++ b/pdf_oralia/extract.py
@@ -41,15 +41,20 @@ def pdf_extract_tables_lines(pdf):
     patrimoine_sink = patrimoine.fsm()
     next(patrimoine_sink)
 
-    page_number = 1
-    for page in pdf.pages:
+    for page_number, page in enumerate(pdf.pages):
         page_text = page.extract_text()
         date = extract_date(page_text)
-        additionnal_fields = {
-            "immeuble": extract_building(page_text),
-            "mois": date.strftime("%m"),
-            "annee": date.strftime("%Y"),
-        }
+        try:
+            additionnal_fields = {
+                "immeuble": extract_building(page_text),
+                "mois": date.strftime("%m"),
+                "annee": date.strftime("%Y"),
+            }
+        except ValueError:
+            logging.warning(
+                f"L'immeuble de la page {page_number+1} non identifiable. Page ignorée."
+            )
+            continue
         table_type = ""
         if locataire.is_it(page_text):
             table_type = "locataire"
@@ -58,7 +63,10 @@ def pdf_extract_tables_lines(pdf):
         elif patrimoine.is_it(page_text):
             table_type = "patrimoine"
         else:
-            logging.warning(f"Page {page_number} non reconnu. Page ignorée.")
+            logging.warning(
+                f"Type de la page {page_number+1} non identifiable. Page ignorée."
+            )
+            continue
 
         for line in page.extract_table(extract_table_settings):
             if table_type == "locataire":
@@ -78,8 +86,6 @@ def pdf_extract_tables_lines(pdf):
                     res.update(additionnal_fields)
                     yield patrimoine.Line(**res)
 
-        page_number += 1
-
 
 def from_pdf(pdf_file):
     """Build dataframes one about charges and another on loc"""
@@ -97,25 +103,37 @@ def from_pdf(pdf_file):
         else:
             logging.warning(f"Page {page_number+1} non reconnu. Page ignorée.")
 
-    return (
-        pd.DataFrame([c.__dict__ for c in charge_lines]),
-        pd.DataFrame([c.__dict__ for c in locataire_lines]),
-        pd.DataFrame([c.__dict__ for c in patrimoine_lines]),
-    )
+    return {
+        "charge": pd.DataFrame([c.__dict__ for c in charge_lines]),
+        "locataire": pd.DataFrame([c.__dict__ for c in locataire_lines]),
+        "patrimoine": pd.DataFrame([c.__dict__ for c in patrimoine_lines]),
+    }
 
 
-def extract_save(pdf_file, dest):
+def extract_plan(pdf_file, dest):
+    """Map each table kind to the xlsx path planned for it under dest."""
+    # Path() lets callers pass a plain string as well as a Path
+    stem = Path(pdf_file).stem.replace(" ", "_")
+    return {
+        kind: Path(dest) / f"{stem}_{kind}.xlsx"
+        for kind in ("charge", "locataire", "patrimoine")
+    }
+
+
+def extract_save(pdf_file, dest, save=None):
     """Extract charge and locataire for pdf_file and put xlsx file in dest"""
     pdf_file = Path(pdf_file)
-    xls_charge = Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_charge.xlsx"
-    xls_locataire = Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_locataire.xlsx"
-    xls_patrimoine = Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_patrimoine.xlsx"
+    xlss = extract_plan(pdf_file, dest)
 
-    df_charge, df_loc, df_patrimoine = from_pdf(pdf_file)
+    # None (the default) behaves like an empty list: nothing is extracted.
+    # NOTE(review): before `save` existed a two-argument call wrote all three
+    # files; confirm such callers now pass the tables they want.
+    if save:
+        dfs = from_pdf(pdf_file)
 
-    df_charge.to_excel(xls_charge, sheet_name="Charges", index=False)
-    logging.info(f"{xls_charge} saved")
-    df_loc.to_excel(xls_locataire, sheet_name="Location", index=False)
-    logging.info(f"{xls_locataire} saved")
-    df_patrimoine.to_excel(xls_patrimoine, sheet_name="Patrimoine", index=False)
-    logging.info(f"{xls_patrimoine} saved")
+        for s in save:
+            dfs[s].to_excel(xlss[s], sheet_name=s, index=False)
+            logging.info(f"{xlss[s]} saved")
+        return {k: v for k, v in xlss.items() if k in save}
+
+    return xlss
diff --git a/pdf_oralia/scripts.py b/pdf_oralia/scripts.py
index 5ab76a9..a0cd3ad 100644
--- a/pdf_oralia/scripts.py
+++ b/pdf_oralia/scripts.py
@@ -4,7 +4,7 @@ from pathlib import Path
 
 import click
 
-from .extract import extract_save
+from .extract import extract_save, extract_plan
 from .join import join_excel
 
 
@@ -51,18 +51,48 @@ def on(pdf_file, dest):
 
 
 @extract.command()
-@click.option("--src", help="Tous les fichiers dans folder", default="./")
+@click.option(
+    "--src", help="Tous les fichiers dans folder (de façon récursive)", default="./"
+)
 @click.option("--dest", help="Où mettre les fichiers produits", default="./")
-def all(src, dest):
-    p = Path(src)
+@click.option(
+    "--only-plan",
+    help="Ne produit rien mais indique les changements",
+    default=False,
+    is_flag=True,
+)
+@click.option(
+    "--force",
+    help="Écrase les fichiers produits précédemment",
+    default=False,
+    is_flag=True,
+)
+def all(src, dest, force, only_plan):
+    # Extract every PDF found recursively under src, mirroring the folder layout
+    # in dest. Kept as comments: click would show a docstring as command help.
+    src_path = Path(src)
 
-    d = Path(dest)
-    d.mkdir(exist_ok=True)
+    dest = Path(dest)
+    # --only-plan must not touch the filesystem, so skip creating dest then
+    if not only_plan:
+        dest.mkdir(exist_ok=True)
 
-    pdf_files = [x for x in p.iterdir() if ".pdf" in str(x)]
-    for pdf_file in pdf_files:
+    # rglob already recurses; a leading "**/" in the pattern is redundant
+    for pdf_file in src_path.rglob("*.pdf"):
+        relative_path = pdf_file.relative_to(src_path)
+        files_dest = dest / relative_path.parent
         logging.info(f"Found {pdf_file}")
-        extract_save(pdf_file, d)
+
+        plan_dest = extract_plan(pdf_file, files_dest)
+        # keep only the outputs that are missing, or all of them with --force
+        save = [k for k, p in plan_dest.items() if force or not p.exists()]
+
+        if only_plan:
+            for s in save:
+                logging.info(f"Planing to create {plan_dest[s]}")
+        else:
+            files_dest.mkdir(parents=True, exist_ok=True)
+            extract_save(pdf_file, files_dest, save)
 
 
 @main.command()