"""Table extractors recovered from the patch "Feat: split extract".

Provenance: commit e3cc7d18a2c4218b878c88ede124361adb77720e by
Bertrand Benjamin, Tue, 27 Sep 2022 16:07:06 +0200.  The patch moves the
helpers below out of ``pdf_oralia/extract.py`` into two new modules:

* ``pdf_oralia/extract_charge.py``    -> ``extract_charge``
* ``pdf_oralia/extract_locataire.py`` -> ``parse_above_loc``,
  ``extract_situation_loc``

After the patch, ``pdf_oralia/extract.py`` keeps ``from pathlib import Path``
and ``import pdfplumber`` and re-imports the extractors with
``from .extract_charge import extract_charge`` and
``from .extract_locataire import extract_situation_loc``.  The rest of that
file (starting at ``charge_table_settings``) is only visible as truncated
diff context and is therefore not reproduced here.
"""

import numpy as np
import pandas as pd


def extract_charge(table):
    """From pdfplumber table extract the charge dataframe.

    Args:
        table: list of rows; ``table[0]`` is the header row and must contain
            the columns "Débits" and "RECAPITULATIF DES OPERATIONS".

    Returns:
        A DataFrame holding the rows that have a non-empty "Débits" cell and
        whose "RECAPITULATIF DES OPERATIONS" label contains neither "totaux"
        nor "solde" (case-insensitive).  The original row index is kept.
    """
    df = (
        pd.DataFrame(table[1:], columns=table[0])
        .replace("", np.nan)
        .dropna(subset=["Débits"])
    )
    labels = df["RECAPITULATIF DES OPERATIONS"]
    # na=False: a row with a debit but no label is a regular row, not a
    # summary line, so a missing label must never match.
    is_summary = labels.str.contains("TOTAUX|solde", case=False, na=False)
    return df[~is_summary]


def parse_above_loc(content):
    """Split the cell found above a "Totaux" row into lot, type and tenant.

    When ``content`` is made of exactly two lines, the first line is split on
    spaces: its second word becomes "lot", the remaining words become "type",
    and the second line becomes "locataire".  Any other shape (no newline, or
    more than one newline) is kept whole as "locataire" with empty "lot" and
    "type".

    Args:
        content: cell text, or None for an empty cell (treated as "").

    Returns:
        A pandas Series with the keys "lot", "type" and "locataire".
    """
    if content is None:
        content = ""

    first_line, newline, tenant = content.partition("\n")
    if not newline or "\n" in tenant:
        # Not exactly two lines: nothing reliable to parse.
        return pd.Series({"lot": "", "type": "", "locataire": content})

    words = first_line.split(" ")
    # A one-word first line carries no lot number; do not index past the end.
    lot = words[1] if len(words) > 1 else ""
    return pd.Series(
        {"lot": lot, "type": " ".join(words[2:]), "locataire": tenant}
    )


def extract_situation_loc(table):
    """From pdfplumber table extract locataire df.

    Every row whose "Locataires" cell equals "Totaux" is kept and completed
    with the lot / type / tenant parsed from the "Locataires" cell of the row
    just above it.

    Args:
        table: list of rows; ``table[0]`` is the header row and must contain
            the columns "Locataires", "" and "Période".

    Returns:
        A DataFrame with one row per "Totaux" row, without the "Locataires",
        "" and "Période" columns and with the extra columns "lot", "type"
        and "locataire".

    Raises:
        ValueError: if ``table`` is empty or contains no "Totaux" row.
    """
    if not table:
        # Previously this case printed the table and then crashed on an
        # unbound ``df``; fail with an explicit message instead.
        raise ValueError(f"cannot extract locataires from empty table: {table!r}")

    df = pd.DataFrame(table[1:], columns=table[0])
    labels = df["Locataires"]

    rows = []
    for pos in range(len(df)):
        if labels.iloc[pos] != "Totaux":
            continue
        # A "Totaux" row at the very top has no row above it; a bare
        # ``pos - 1`` would silently wrap around to the last row.
        above = labels.iloc[pos - 1] if pos > 0 else ""
        rows.append(pd.concat([df.iloc[pos], parse_above_loc(above)]))

    if not rows:
        raise ValueError('no "Totaux" row found in the locataire table')

    df_cleaned = pd.concat(rows, axis=1).T
    return df_cleaned.drop(columns=["Locataires", "", "Période"])