diff --git a/pdf_auralia/__init__.py b/pdf_oralia/__init__.py
similarity index 100%
rename from pdf_auralia/__init__.py
rename to pdf_oralia/__init__.py
diff --git a/pdf_auralia/extract.py b/pdf_oralia/extract.py
similarity index 100%
rename from pdf_auralia/extract.py
rename to pdf_oralia/extract.py
diff --git a/pdf_oralia/scripts.py b/pdf_oralia/scripts.py
new file mode 100644
index 0000000..62bdc38
--- /dev/null
+++ b/pdf_oralia/scripts.py
@@ -0,0 +1,180 @@
+"""Extract the tenant and charge tables of a PDF statement to Excel files."""
+
+from pathlib import Path
+
+import click
+import numpy as np
+import pandas as pd
+import pdfplumber
+
+
+def extract_situation_loc(table):
+    """Build one row per "Totaux" line of the tenant table.
+
+    Args:
+        table: Rows as lists of cell values; the first row holds the column
+            names, which must include "Locataires".
+
+    Returns:
+        A DataFrame with one row per "Totaux" line, extended with the "lot",
+        "type" and "locataire" values parsed from the cell just above it.
+
+    Raises:
+        ValueError: If the table is empty or holds no "Totaux" line.
+    """
+    if not table:
+        raise ValueError("Empty tenant table: nothing to extract")
+    df = pd.DataFrame(table[1:], columns=table[0])
+
+    rows = []
+    # Use positions so the "row above" lookup does not depend on labels.
+    for pos in np.flatnonzero((df["Locataires"] == "Totaux").to_numpy()):
+        # A "Totaux" line on the very first row has nothing above it.
+        above = df.iloc[pos - 1]["Locataires"] if pos > 0 else ""
+        rows.append(pd.concat([df.iloc[pos], parse_above_loc(above)]))
+    if not rows:
+        raise ValueError('No "Totaux" line found in the tenant table')
+
+    df_cleaned = pd.concat(rows, axis=1).T
+    # errors="ignore": tolerate tables lacking the unnamed or "Période" column.
+    return df_cleaned.drop(columns=["Locataires", "", "Période"], errors="ignore")
+
+
+def parse_above_loc(content):
+    """Split the cell above a "Totaux" line into lot, type and tenant name.
+
+    Args:
+        content: Cell text, expected as two lines: "<label> <lot> <type...>"
+            followed by the tenant name. None is treated as empty text.
+
+    Returns:
+        A Series with keys "lot", "type" and "locataire". When the text does
+        not hold exactly two lines, it is kept whole as "locataire".
+    """
+    if content is None:
+        content = ""
+    lines = content.split("\n")
+    if len(lines) != 2:
+        return pd.Series({"lot": "", "type": "", "locataire": content})
+
+    unit, tenant = lines
+    words = unit.split(" ")
+    return pd.Series(
+        {
+            # words[0] is the leading label of the line; it is not kept.
+            "lot": words[1] if len(words) > 1 else "",
+            "type": " ".join(words[2:]),
+            "locataire": tenant,
+        }
+    )
+
+
+def extract_charge(table):
+    """Return the charge lines of the operations summary table.
+
+    Args:
+        table: Rows as lists of cell values; the first row holds the column
+            names, which must include "Débits" and
+            "RECAPITULATIF DES OPERATIONS".
+
+    Returns:
+        A DataFrame without the rows whose "Débits" cell is empty and without
+        the rows whose label contains "TOTAUX" or "solde" (any case).
+    """
+    df = (
+        pd.DataFrame(table[1:], columns=table[0])
+        .replace("", np.nan)
+        .dropna(subset=["Débits"])
+    )
+    label = df["RECAPITULATIF DES OPERATIONS"]
+    # Combine the masks element-wise with "|": "or" between two Series raises
+    # ValueError. na=False keeps the rows whose label cell is empty.
+    has_totaux = label.str.contains("TOTAUX", case=False, na=False)
+    has_solde = label.str.contains("solde", case=False, na=False)
+    return df[~(has_totaux | has_solde)]
+
+
+# Table-detection settings passed to page.extract_table() for the charge page.
+charge_table_settings = {
+    "vertical_strategy": "lines",
+    "horizontal_strategy": "text",
+}
+
+
+def extract_from_pdf(pdf, charge_dest, location_dest):
+    """Export the charge and tenant tables of an opened PDF to Excel files.
+
+    Args:
+        pdf: Opened pdfplumber PDF (any object exposing "pages" works).
+        charge_dest: Path of the Excel file receiving the "Charges" sheet.
+        location_dest: Path of the Excel file receiving the "Location" sheet.
+    """
+    loc_table = []
+    mois = annee = None
+    for page in pdf.pages:
+        # Extract the text once; both page checks below need it.
+        text = page.extract_text() or ""
+        title_lines = [
+            line for line in text.split("\n") if "SITUATION DES LOCATAIRES" in line
+        ]
+        if title_lines:
+            # The last two words of the title line are used as month and year.
+            mois, annee = title_lines[0].split(" ")[-2:]
+            # extract_table() returns None when it finds no table on the page.
+            page_table = page.extract_table() or []
+            # Later pages lose their first row so the header is kept only once.
+            loc_table += page_table[1:] if loc_table else page_table
+
+        if "HONORAIRES" in text:
+            charge_table = page.extract_table(charge_table_settings)
+            if charge_table:
+                # NOTE: every matching page overwrites charge_dest.
+                df_charge = extract_charge(charge_table)
+                df_charge.to_excel(charge_dest, sheet_name="Charges", index=False)
+                print(f"{charge_dest} saved")
+
+    if not loc_table:
+        # No tenant table means no month/year either: nothing to export.
+        print(f"No tenant table found, {location_dest} not saved")
+        return
+
+    df_loc = extract_situation_loc(loc_table)
+    df_loc = df_loc.assign(mois=mois, annee=annee)
+    df_loc.to_excel(location_dest, sheet_name="Location", index=False)
+    print(f"{location_dest} saved")
+
+
+def extract_save(pdf_file):
+    """Extract the tables of one PDF into Excel files in the working directory.
+
+    Args:
+        pdf_file: Path of the PDF. The outputs are named after its stem, with
+            spaces replaced by underscores: "<stem>_charge.xlsx" and
+            "<stem>_locataire.xlsx".
+    """
+    pdf_file = Path(pdf_file)
+    stem = pdf_file.stem.replace(" ", "_")
+    # The context manager closes the PDF even when the extraction fails.
+    with pdfplumber.open(pdf_file) as pdf:
+        extract_from_pdf(pdf, f"{stem}_charge.xlsx", f"{stem}_locataire.xlsx")
+
+
+# Command-line entry point: convert --pdf_file when given, otherwise every PDF
+# found directly inside --folder. Documented with comments on purpose: click
+# would print a docstring in the --help output.
+@click.command()
+@click.option("--pdf_file", help="Nom du fichier pdf", default="")
+@click.option("--folder", help="Tous les fichiers dans folder", default="./")
+def pdf2xlsx(pdf_file, folder):
+    if pdf_file:
+        extract_save(pdf_file)
+        return
+    # Match on the real suffix: a ".pdf" substring also matches "a.pdf.bak" or
+    # any file located under a folder whose name contains ".pdf".
+    for path in sorted(Path(folder).iterdir()):
+        if path.is_file() and path.suffix.lower() == ".pdf":
+            extract_save(path)
+
+
+if __name__ == "__main__":
+    pdf2xlsx()
diff --git a/pyproject.toml b/pyproject.toml
index 711c9f3..26ae8e9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,10 +1,13 @@
 [tool.poetry]
-name = "pdf-auralia"
+name = "pdf-oralia"
 version = "0.1.0"
 description = ""
 authors = ["Bertrand Benjamin "]
 readme = "README.md"
-packages = [{include = "pdf_auralia"}]
+packages = [{include = "pdf_oralia"}]
+
+[tool.poetry.scripts]
+pdf-oralia = "pdf_oralia.scripts:pdf2xlsx"
 
 [tool.poetry.dependencies]
 python = "^3.10"