Feat: split extract

2022-09-27 16:07:06 +02:00 · 2022-09-27 16:07:06 +02:00 · e3cc7d18a2
commit e3cc7d18a2
parent 4031be77c6
3 changed files with 59 additions and 55 deletions
--- a/pdf_oralia/extract.py
+++ b/pdf_oralia/extract.py
@ -1,62 +1,9 @@
 from pathlib import Path

-import click
-import numpy as np
-import pandas as pd
 import pdfplumber

-
-def extract_situation_loc(table):
-    try:
-        df = pd.DataFrame(table[1:], columns=table[0])
-    except IndexError:
-        print(table)
-    rows = []
-    for i, row in df[df["Locataires"] == "Totaux"].iterrows():
-        above_row_loc = df.iloc[i - 1]["Locataires"]
-        up_row = pd.concat(
-            [
-                row,
-                parse_above_loc(above_row_loc),
-            ]
-        )
-
-        rows.append(up_row)
-    df_cleaned = pd.concat(rows, axis=1).T
-    df_cleaned.drop(["Locataires", "", "Période"], axis=1, inplace=True)
-    return df_cleaned
-
-
-def parse_above_loc(content):
-    row = {}
-    try:
-        app, loc = content.split("\n")
-    except ValueError:
-        row["lot"] = ""
-        row["type"] = ""
-        row["locataire"] = content
-
-    else:
-        app_ = app.split(" ")
-        row["lot"] = app_[1]
-        row["type"] = " ".join(app_[2:])
-        row["locataire"] = loc
-    return pd.Series(row)
-
-
-def extract_charge(table):
-    df = (
-        pd.DataFrame(table[1:], columns=table[0])
-        .replace("", np.nan)
-        .dropna(subset=["Débits"])
-    )
-    drop_index = df[
-        df["RECAPITULATIF DES OPERATIONS"].str.contains("TOTAUX", case=False)
-        | df["RECAPITULATIF DES OPERATIONS"].str.contains("solde", case=False)
-    ].index
-    df.drop(drop_index, inplace=True)
-    return df
-
+from .extract_charge import extract_charge
+from .extract_locataire import extract_situation_loc

 charge_table_settings = {
    "vertical_strategy": "lines",
--- a/pdf_oralia/extract_charge.py
+++ b/pdf_oralia/extract_charge.py
@ -0,0 +1,17 @@
+import numpy as np
+import pandas as pd
+
+
+def extract_charge(table):
+    """Build the charge dataframe from a pdfplumber table.
+
+    The first row of ``table`` supplies the column names and the remaining
+    rows the data. Empty-string cells are turned into NaN, rows without a
+    value in the "Débits" column are dropped, and rows whose
+    "RECAPITULATIF DES OPERATIONS" label contains "TOTAUX" or "solde"
+    (case-insensitive) are removed.
+
+    Args:
+        table: Sequence of rows (each a sequence of cell values); row 0 is
+            the header and must contain the "Débits" and
+            "RECAPITULATIF DES OPERATIONS" columns.
+
+    Returns:
+        A ``pandas.DataFrame`` holding the remaining rows, with their
+        original index labels.
+    """
+    df = (
+        pd.DataFrame(table[1:], columns=table[0])
+        .replace("", np.nan)
+        .dropna(subset=["Débits"])
+    )
+    label = df["RECAPITULATIF DES OPERATIONS"]
+    # na=False: a row can keep a "Débits" value while its label cell was
+    # blanked to NaN by the replace above. Without it the mask contains NaN
+    # and boolean indexing raises instead of simply keeping that row.
+    is_summary = label.str.contains(
+        "TOTAUX", case=False, na=False
+    ) | label.str.contains("solde", case=False, na=False)
+    return df[~is_summary]
--- a/pdf_oralia/extract_locataire.py
+++ b/pdf_oralia/extract_locataire.py
@ -0,0 +1,40 @@
+import pandas as pd
+
+
+def parse_above_loc(content):
+    """Split a "Locataires" cell into lot, type and locataire fields.
+
+    When ``content`` is exactly two lines, the first line is split on single
+    spaces: the second token becomes ``lot`` and the remaining tokens,
+    re-joined with spaces, become ``type``; the second line becomes
+    ``locataire``. Any other shape (no newline, or more than one) leaves
+    ``lot`` and ``type`` empty and stores the whole content as ``locataire``.
+
+    Args:
+        content: Text of the cell, or ``None`` for an empty cell.
+
+    Returns:
+        A ``pandas.Series`` with the keys ``lot``, ``type`` and
+        ``locataire``.
+    """
+    # An empty cell may arrive as None; treat it like an empty string
+    # instead of failing on ``None.split``.
+    text = "" if content is None else content
+    lines = text.split("\n")
+    if len(lines) != 2:
+        return pd.Series({"lot": "", "type": "", "locataire": text})
+
+    app, loc = lines
+    tokens = app.split(" ")
+    return pd.Series(
+        {
+            # A first line without any space has no second token; fall back
+            # to an empty lot rather than raising IndexError.
+            "lot": tokens[1] if len(tokens) > 1 else "",
+            "type": " ".join(tokens[2:]),
+            "locataire": loc,
+        }
+    )
+
+
+def extract_situation_loc(table):
+    """Build the locataire dataframe from a pdfplumber table.
+
+    The first row of ``table`` supplies the column names. Every row whose
+    "Locataires" cell equals "Totaux" is kept and extended with the ``lot``,
+    ``type`` and ``locataire`` fields parsed from the "Locataires" cell of
+    the row directly above it. The "Locataires", "" and "Période" columns
+    are then dropped.
+
+    Args:
+        table: Sequence of rows (each a sequence of cell values); row 0 is
+            the header and must contain the "Locataires", "" and "Période"
+            columns.
+
+    Returns:
+        A ``pandas.DataFrame`` with one row per "Totaux" row.
+
+    Raises:
+        ValueError: If ``table`` is empty or contains no "Totaux" row.
+    """
+    if not table:
+        # Fail loudly here: without a header row there is no dataframe to
+        # work on, and carrying on would only crash later on an unbound name.
+        raise ValueError("Cannot extract locataires from an empty table")
+    df = pd.DataFrame(table[1:], columns=table[0])
+    locataires = df["Locataires"]
+
+    rows = []
+    for pos, label in enumerate(locataires):
+        if label != "Totaux":
+            continue
+        # A "Totaux" row at position 0 has nothing above it; indexing with
+        # ``pos - 1`` would wrap around to the last row of the table.
+        above_row_loc = locataires.iloc[pos - 1] if pos > 0 else ""
+        rows.append(pd.concat([df.iloc[pos], parse_above_loc(above_row_loc)]))
+
+    if not rows:
+        raise ValueError('No "Totaux" row found in the locataire table')
+    df_cleaned = pd.concat(rows, axis=1).T
+    df_cleaned.drop(["Locataires", "", "Période"], axis=1, inplace=True)
+    return df_cleaned