"""Extract the charge and tenant-situation tables of a PDF report to Excel."""

from pathlib import Path

import click
import numpy as np
import pandas as pd
import pdfplumber


def extract_situation_loc(table):
    """Build one row per "Totaux" line of the tenant-situation table.

    Args:
        table: List of rows (lists of cell values) whose first row holds the
            column names. A "Locataires" column is required.

    Returns:
        A DataFrame with one row per line whose "Locataires" cell equals
        "Totaux", extended with the ``lot``, ``type`` and ``locataire``
        fields parsed from the "Locataires" cell of the line just above.
        The "Locataires", "" and "Période" columns are removed when present.
        An empty DataFrame is returned when ``table`` is empty or contains
        no "Totaux" line.

    Raises:
        KeyError: If the table has no "Locataires" column.
    """
    if not table:
        return pd.DataFrame()

    df = pd.DataFrame(table[1:], columns=table[0])
    is_total = (df["Locataires"] == "Totaux").to_numpy()

    rows = []
    for pos in np.flatnonzero(is_total):
        # A "Totaux" line on the very first row has nothing above it;
        # iloc[-1] would silently wrap around to the last row.
        above_row_loc = df.iloc[pos - 1]["Locataires"] if pos > 0 else ""
        rows.append(pd.concat([df.iloc[pos], parse_above_loc(above_row_loc)]))

    if not rows:
        return pd.DataFrame()

    df_cleaned = pd.concat(rows, axis=1).T
    return df_cleaned.drop(
        columns=["Locataires", "", "Période"], errors="ignore"
    )


def parse_above_loc(content):
    """Split a "Locataires" cell into lot, type and tenant name.

    A cell made of exactly two lines is read as ``"<word> <lot> <type...>"``
    followed by the tenant name. Any other layout is kept whole as the
    tenant name, with empty ``lot`` and ``type``.

    Args:
        content: Text of the cell; a non-string value (e.g. ``None`` for an
            empty cell) is treated as an empty string.

    Returns:
        A Series with the keys ``lot``, ``type`` and ``locataire``.
    """
    if not isinstance(content, str):
        content = ""

    row = {}
    try:
        app, loc = content.split("\n")
    except ValueError:
        row["lot"] = ""
        row["type"] = ""
        row["locataire"] = content
    else:
        tokens = app.split(" ")
        # The first token is skipped; a single-token line has no lot number.
        row["lot"] = tokens[1] if len(tokens) > 1 else ""
        row["type"] = " ".join(tokens[2:])
        row["locataire"] = loc
    return pd.Series(row)


def extract_charge(table):
    """Return the charge lines of the operations summary table.

    Empty cells are turned into NaN, lines without a "Débits" value are
    dropped, and lines whose "RECAPITULATIF DES OPERATIONS" cell contains
    "TOTAUX" or "solde" (case-insensitive) are removed.

    Args:
        table: List of rows (lists of cell values) whose first row holds the
            column names.

    Returns:
        The filtered DataFrame, or an empty DataFrame when ``table`` is
        empty.
    """
    if not table:
        return pd.DataFrame()

    df = (
        pd.DataFrame(table[1:], columns=table[0])
        .replace("", np.nan)
        .dropna(subset=["Débits"])
    )
    # Element-wise match: `or` between two Series raises ValueError, and
    # na=False keeps NaN labels from poisoning the boolean mask.
    is_summary = df["RECAPITULATIF DES OPERATIONS"].str.contains(
        "TOTAUX|solde", case=False, na=False
    )
    return df[~is_summary]


charge_table_settings = {
    "vertical_strategy": "lines",
    "horizontal_strategy": "text",
}


def extract_from_pdf(pdf, charge_dest, location_dest):
    """Export the charge and tenant tables of an opened PDF to Excel files.

    Pages whose text contains "SITUATION DES LOCATAIRES" contribute their
    table to the tenant table (the header row is kept from the first such
    page only); the last two words of that line are stored in the ``mois``
    and ``annee`` columns. Pages whose text contains "HONORAIRES" provide
    the charge table; when several pages match, the last one wins.

    A table that cannot be found is reported on stdout and its file is not
    written.

    Args:
        pdf: Opened pdfplumber document (anything exposing ``pages`` whose
            items offer ``extract_text`` and ``extract_table``).
        charge_dest: Destination of the "Charges" workbook.
        location_dest: Destination of the "Location" workbook.
    """
    loc_table = []
    mois = annee = None
    df_charge = None

    for page in pdf.pages:
        # Extract the text once; it may be None on pages without text.
        text = page.extract_text() or ""

        situation_loc_line = [
            line
            for line in text.split("\n")
            if "SITUATION DES LOCATAIRES" in line
        ]
        if situation_loc_line:
            mois, annee = situation_loc_line[0].split()[-2:]
            table = page.extract_table() or []
            # Continuation pages repeat the header row: keep it only once.
            loc_table += table[1:] if loc_table else table

        if "HONORAIRES" in text:
            table = page.extract_table(charge_table_settings)
            if table:
                df_charge = extract_charge(table)

    if df_charge is None:
        print(f"{charge_dest} not saved: no charge table found")
    else:
        df_charge.to_excel(charge_dest, sheet_name="Charges", index=False)
        print(f"{charge_dest} saved")

    df_loc = extract_situation_loc(loc_table)
    if df_loc.empty:
        print(f"{location_dest} not saved: no tenant table found")
    else:
        df_loc = df_loc.assign(mois=mois, annee=annee)
        df_loc.to_excel(location_dest, sheet_name="Location", index=False)
        print(f"{location_dest} saved")
def extract_save(pdf_file):
    """Extract the tables of one PDF report into two Excel workbooks.

    The workbooks are named ``<stem>_charge.xlsx`` and
    ``<stem>_locataire.xlsx`` (spaces of the stem replaced by underscores)
    and, being relative names, are written to the current working directory.

    Args:
        pdf_file: Path of the PDF file to process (``str`` or ``Path``).
    """
    pdf_file = Path(pdf_file)
    stem = pdf_file.stem.replace(" ", "_")
    xls_charge = f"{stem}_charge.xlsx"
    xls_locataire = f"{stem}_locataire.xlsx"

    # Context manager so the file handle is released even if extraction fails.
    with pdfplumber.open(pdf_file) as pdf:
        extract_from_pdf(pdf, xls_charge, xls_locataire)


@click.command()
@click.option("--pdf_file", help="Nom du fichier pdf", default="")
@click.option("--folder", help="Tous les fichiers dans folder", default="./")
def pdf2xlsx(pdf_file, folder):
    """Convert one PDF report, or every PDF file of a folder, to Excel."""
    if pdf_file:
        extract_save(pdf_file)
        return

    # Match on the file suffix: a substring test on the whole path would
    # also pick directories, "x.pdf.bak", or anything below "my.pdfs/".
    pdf_files = sorted(
        entry
        for entry in Path(folder).iterdir()
        if entry.is_file() and entry.suffix.lower() == ".pdf"
    )
    for path in pdf_files:
        extract_save(path)


if __name__ == "__main__":
    pdf2xlsx()