Feat: marche avec les pdfs tous ensembles

2023-06-16 08:32:36 +02:00
parent 1afb2a32ab
commit 8a55e6e2cc
9 changed files with 303 additions and 216 deletions
--- a/pdf_oralia/pages/__init__.py
+++ b/pdf_oralia/pages/__init__.py
@@ -0,0 +1 @@
+from . import charge, locataire, patrimoine, recapitulatif
--- a/pdf_oralia/pages/charge.py
+++ b/pdf_oralia/pages/charge.py
@@ -0,0 +1,72 @@
+import numpy as np
+import pandas as pd
+
+# Index of the row cell that keep_row and extract inspect for totals, lot
+# numbers and fees -- presumably the "RECAPITULATIF DES OPERATIONS" column
+# (TODO confirm against the extracted table layout).
+RECAPITULATIF_DES_OPERATION = 1
+
+
+def is_it(page_text):
+    """Tell whether page_text is an operations-summary page.
+
+    True when the text contains "RECAPITULATIF DES OPERATIONS" but does not
+    contain "COMPTE RENDU DE GESTION".
+    """
+    has_summary_title = "RECAPITULATIF DES OPERATIONS" in page_text
+    has_report_title = "COMPTE RENDU DE GESTION" in page_text
+    return has_summary_title and not has_report_title
+
+
+def get_lot(x):
+    """Return the lot code found at the start of an operation label.
+
+    Args:
+        x: Text of the label cell.
+
+    Returns:
+        The two leading characters when both are digits, a single leading
+        digit left-padded with "0", "PC" when the text starts with "PC",
+        and "" otherwise.
+    """
+    prefix = x[:2]
+    if prefix.isdigit():
+        # A one-character digit string also lands here; pad it so the result
+        # is two characters wide like every other numeric lot.
+        return prefix.zfill(2)
+    if x[:1].isdigit():
+        return "0" + x[:1]
+    if prefix == "PC":
+        return "PC"
+    return ""
+
+
+def keep_row(row):
+    """Return False for rows whose label cell holds a total or balance marker.
+
+    The comparison is case-insensitive and looks at the cell at index
+    RECAPITULATIF_DES_OPERATION.
+    """
+    label = row[RECAPITULATIF_DES_OPERATION].lower()
+    for marker in ("TOTAL", "TOTAUX", "Solde créditeur", "Solde débiteur"):
+        if marker.lower() in label:
+            return False
+    return True
+
+
+def extract(table, additionnal_fields: "dict | None" = None):
+    """Turn a table into a list of row dictionaries.
+
+    Args:
+        table: Sequence of rows; the first row is the header.
+        additionnal_fields: Extra key/value pairs copied into every record.
+
+    Returns:
+        One dict per row accepted by keep_row, keyed by header cell. Cells
+        under an empty header are stored under "Fournisseur", and every
+        record gets a "lot" key computed by get_lot. An empty table yields
+        an empty list.
+    """
+    # None instead of a shared mutable {} default.
+    extra_fields = {} if additionnal_fields is None else additionnal_fields
+    if not table:
+        return []
+
+    header = table[0]
+    extracted = []
+    for row in table[1:]:
+        if not keep_row(row):
+            continue
+
+        record = {}
+        for i, value in enumerate(row):
+            # Cells under an unnamed header column go under "Fournisseur".
+            key = header[i] if header[i] != "" else "Fournisseur"
+            record[key] = value
+        record.update(extra_fields)
+
+        label = row[RECAPITULATIF_DES_OPERATION]
+        record["lot"] = get_lot(label)
+        # Rows whose label mentions "honoraire" are attributed to IMI GERANCE.
+        if "honoraire" in label:
+            record["Fournisseur"] = "IMI GERANCE"
+
+        extracted.append(record)
+
+    return extracted
+
+
+def table2df(tables):
+    """Build one DataFrame from several lists of row records.
+
+    For each table, empty strings become NaN, rows with neither "Débits"
+    nor "Crédits" are dropped, and missing "Fournisseur" values are filled
+    from the previous remaining row.
+
+    Args:
+        tables: Iterable of record lists, each record holding at least the
+            "Débits", "Crédits" and "Fournisseur" keys.
+
+    Returns:
+        The concatenation of the per-table frames, or an empty DataFrame
+        when tables is empty.
+    """
+    dfs = []
+    for table in tables:
+        df = (
+            pd.DataFrame.from_records(table)
+            .replace("", np.nan)
+            .dropna(subset=["Débits", "Crédits"], how="all")
+        )
+        # Series.ffill() replaces the deprecated fillna(method="ffill").
+        df["Fournisseur"] = df["Fournisseur"].ffill()
+        dfs.append(df)
+    if not dfs:
+        # pd.concat raises on an empty list; return an empty frame instead.
+        return pd.DataFrame()
+    return pd.concat(dfs)
--- a/pdf_oralia/pages/locataire.py
+++ b/pdf_oralia/pages/locataire.py
@@ -0,0 +1,134 @@
+import pandas as pd
+
+
+def is_it(page_text):
+    """Tell whether page_text contains "SITUATION DES LOCATAIRES"."""
+    return "SITUATION DES LOCATAIRES" in page_text
+
+
+def is_drop(row):
+    """Tell whether a row should be discarded.
+
+    A row is discarded when its first cell contains "totaux" (any case) or
+    when every cell is falsy.
+    """
+    is_totals_row = "totaux" in row[0].lower()
+    return is_totals_row or not any(row)
+
+
+def extract(table, additionnal_fields: "dict | None" = None):
+    """Turn a table into a list of row dictionaries.
+
+    Args:
+        table: Sequence of rows; the first row is the header.
+        additionnal_fields: Extra key/value pairs copied into every record.
+
+    Returns:
+        One dict per row not rejected by is_drop, keyed by header cell.
+        Cells under an empty header are skipped. An empty table yields an
+        empty list.
+    """
+    # None instead of a shared mutable {} default.
+    extra_fields = {} if additionnal_fields is None else additionnal_fields
+    if not table:
+        return []
+
+    header = table[0]
+    extracted = []
+    for row in table[1:]:
+        if is_drop(row):
+            continue
+        record = {
+            header[i]: value for i, value in enumerate(row) if header[i] != ""
+        }
+        record.update(extra_fields)
+        extracted.append(record)
+    return extracted
+
+
+def _merge_split_row(last, following):
+    """Merge the two halves of a row that was split across two tables.
+
+    For every key of last: equal values are kept as is, two different
+    truthy values are joined with a newline, otherwise the truthy one is
+    kept ("" when neither is truthy).
+
+    Raises:
+        KeyError: if following lacks a key present in last.
+    """
+    row = {}
+    for key in last:
+        first_value = last[key]
+        second_value = following[key]
+        if first_value == second_value:
+            row[key] = first_value
+        elif first_value and second_value:
+            row[key] = f"{first_value}\n{second_value}"
+        else:
+            row[key] = first_value or second_value or ""
+    return row
+
+
+def join_tables(tables):
+    """Concatenate tables, merging rows split across a table boundary.
+
+    When the last row collected so far is not a "totaux" row, the first row
+    of the next table is merged into it; otherwise the next table is simply
+    appended. The merge helper has its own private name because a later
+    one-argument join_row in this module would otherwise shadow it.
+
+    Args:
+        tables: Sequence of tables, each a list of row dicts with a
+            "Locataires" key.
+
+    Returns:
+        A new list of rows; the input tables are left untouched. Empty
+        tables are skipped and empty input yields an empty list.
+    """
+    joined = []
+    for table in tables:
+        if not table:
+            continue
+        if joined and "totaux" not in joined[-1]["Locataires"].lower():
+            joined[-1] = _merge_split_row(joined[-1], table[0])
+            joined.extend(table[1:])
+        else:
+            joined.extend(table)
+    return joined
+
+
+def parse_lot(string):
+    """Split a lot label on single spaces into its number and type.
+
+    The second word becomes "Lot" and the remaining words, re-joined with
+    spaces, become "Type".
+    """
+    parts = string.split(" ")
+    lot_number = parts[1]
+    lot_type = " ".join(parts[2:])
+    return {"Lot": lot_number, "Type": lot_type}
+
+
+def join_row(table):
+    """Propagate lot and tenant information across the rows of a table.
+
+    Rows are processed in order and modified in place. Depending on the
+    "Locataires" and "Période" cells, a row either starts a new lot, extends
+    or replaces the previously collected row, or inherits "Lot", "Type" and
+    "Locataires" from it. Rows matching no rule are printed and left out.
+
+    Args:
+        table: Iterable of row dicts with at least the "Locataires" and
+            "Période" keys.
+
+    Returns:
+        The list of collected rows (the same dict objects as in table).
+
+    Raises:
+        IndexError: if a row that needs the previously collected row comes
+            before any row has been collected.
+    """
+    joined = []
+    for row in table:
+        if row["Locataires"].startswith("Lot"):
+            # Lot header: store the parsed "Lot"/"Type" and clear the name.
+            row.update(parse_lot(row["Locataires"]))
+            row["Locataires"] = ""
+            joined.append(row)
+        elif row["Locataires"] == "Rappel de Loyer":
+            # Added as an extra row: inherits lot and tenant from the previous
+            # row (which stays in place) and moves the label to "Divers".
+            last_row = joined[-1]
+            row.update(
+                {
+                    "Lot": last_row["Lot"],
+                    "Type": last_row["Type"],
+                    "Locataires": last_row["Locataires"],
+                    "Divers": "Rappel de Loyer",
+                }
+            )
+            joined.append(row)
+
+        elif row["Locataires"]:
+            # Any other non-empty name replaces the previous row: every truthy
+            # value of that row overrides this one, and the name is appended
+            # to the previous name after a space.
+            last_row = joined.pop()
+            row_name = row["Locataires"].replace("\n", " ")
+            row.update({k: v for k, v in last_row.items() if v})
+            row["Locataires"] = last_row["Locataires"] + " " + row_name
+            joined.append(row)
+
+        else:
+            if row["Période"].startswith("Solde"):
+                # "Solde" row: takes the place of the previous row, keeping
+                # only its lot and tenant fields.
+                last_row = joined.pop()
+                row.update(
+                    {
+                        "Lot": last_row["Lot"],
+                        "Type": last_row["Type"],
+                        "Locataires": last_row["Locataires"],
+                    }
+                )
+                joined.append(row)
+
+            elif row["Période"].startswith("Du"):
+                # "Du" row: added after the previous row, inheriting its lot
+                # and tenant fields.
+                last_row = joined[-1]
+                row.update(
+                    {
+                        "Lot": last_row["Lot"],
+                        "Type": last_row["Type"],
+                        "Locataires": last_row["Locataires"],
+                    }
+                )
+                joined.append(row)
+            else:
+                # Unrecognised row: printed, not added to the result.
+                print(row)
+
+    return joined
+
+
+def flat_tables(tables):
+    """Concatenate the rows of every table into one flat list."""
+    return [row for table in tables for row in table]
+
+
+def table2df(tables):
+    """Flatten the tables, join their rows and return them as a DataFrame."""
+    rows = join_row(flat_tables(tables))
+    return pd.DataFrame.from_records(rows)
--- a/pdf_oralia/pages/patrimoine.py
+++ b/pdf_oralia/pages/patrimoine.py
@@ -0,0 +1,4 @@
+def is_it(page_text):
+    """Tell whether page_text contains "VOTRE PATRIMOINE"."""
+    return "VOTRE PATRIMOINE" in page_text
--- a/pdf_oralia/pages/recapitulatif.py
+++ b/pdf_oralia/pages/recapitulatif.py
@@ -0,0 +1,34 @@
+import numpy as np
+import pandas as pd
+
+
+def is_it(page_text):
+    """Tell whether page_text contains "COMPTE RENDU DE GESTION"."""
+    return "COMPTE RENDU DE GESTION" in page_text
+
+
+def extract(table, additionnal_fields: "dict | None" = None):
+    """Extract the "Remise commerciale gérance" rows of a table.
+
+    Args:
+        table: Sequence of rows; the first row is the header.
+        additionnal_fields: Extra key/value pairs copied into every record.
+
+    Returns:
+        One dict, keyed by header cell, for every row that has a cell equal
+        to "Remise commerciale gérance". An empty table yields an empty
+        list.
+    """
+    # None instead of a shared mutable {} default.
+    extra_fields = {} if additionnal_fields is None else additionnal_fields
+    if not table:
+        return []
+
+    header = table[0]
+    extracted = []
+    for row in table[1:]:
+        # Membership test on the row: a cell must match exactly.
+        if "Remise commerciale gérance" not in row:
+            continue
+        record = {header[i]: value for i, value in enumerate(row)}
+        record.update(extra_fields)
+        extracted.append(record)
+
+    return extracted
				`@@ -0,0 +1 @@`
				`from . import charge, locataire, patrimoine, recapitulatif`