From e3cc7d18a2c4218b878c88ede124361adb77720e Mon Sep 17 00:00:00 2001
From: Bertrand Benjamin <benjamin.bertrand@opytex.org>
Date: Tue, 27 Sep 2022 16:07:06 +0200
Subject: [PATCH] Feat: split extract

---
 pdf_oralia/extract.py           | 57 ++-------------------------------
 pdf_oralia/extract_charge.py    | 17 ++++++++++
 pdf_oralia/extract_locataire.py | 40 +++++++++++++++++++++++
 3 files changed, 59 insertions(+), 55 deletions(-)
 create mode 100644 pdf_oralia/extract_charge.py
 create mode 100644 pdf_oralia/extract_locataire.py

diff --git a/pdf_oralia/extract.py b/pdf_oralia/extract.py
index 69593ee..0058a10 100644
--- a/pdf_oralia/extract.py
+++ b/pdf_oralia/extract.py
@@ -1,62 +1,9 @@
 from pathlib import Path
 
-import click
-import numpy as np
-import pandas as pd
 import pdfplumber
 
-
-def extract_situation_loc(table):
-    try:
-        df = pd.DataFrame(table[1:], columns=table[0])
-    except IndexError:
-        print(table)
-    rows = []
-    for i, row in df[df["Locataires"] == "Totaux"].iterrows():
-        above_row_loc = df.iloc[i - 1]["Locataires"]
-        up_row = pd.concat(
-            [
-                row,
-                parse_above_loc(above_row_loc),
-            ]
-        )
-
-        rows.append(up_row)
-    df_cleaned = pd.concat(rows, axis=1).T
-    df_cleaned.drop(["Locataires", "", "Période"], axis=1, inplace=True)
-    return df_cleaned
-
-
-def parse_above_loc(content):
-    row = {}
-    try:
-        app, loc = content.split("\n")
-    except ValueError:
-        row["lot"] = ""
-        row["type"] = ""
-        row["locataire"] = content
-
-    else:
-        app_ = app.split(" ")
-        row["lot"] = app_[1]
-        row["type"] = " ".join(app_[2:])
-        row["locataire"] = loc
-    return pd.Series(row)
-
-
-def extract_charge(table):
-    df = (
-        pd.DataFrame(table[1:], columns=table[0])
-        .replace("", np.nan)
-        .dropna(subset=["Débits"])
-    )
-    drop_index = df[
-        df["RECAPITULATIF DES OPERATIONS"].str.contains("TOTAUX", case=False)
-        | df["RECAPITULATIF DES OPERATIONS"].str.contains("solde", case=False)
-    ].index
-    df.drop(drop_index, inplace=True)
-    return df
-
+from .extract_charge import extract_charge
+from .extract_locataire import extract_situation_loc
 
 charge_table_settings = {
     "vertical_strategy": "lines",
diff --git a/pdf_oralia/extract_charge.py b/pdf_oralia/extract_charge.py
new file mode 100644
index 0000000..4705104
--- /dev/null
+++ b/pdf_oralia/extract_charge.py
@@ -0,0 +1,17 @@
+import numpy as np
+import pandas as pd
+
+
+def extract_charge(table):
+    """Build the charge dataframe from a pdfplumber table.
+
+    The first item of ``table`` is the header. Empty cells become NaN, rows
+    without a "Débits" value are dropped, as are rows whose label contains
+    "TOTAUX" or "solde" (case-insensitive). Rows with no label are kept.
+    """
+    df = pd.DataFrame(table[1:], columns=table[0]).replace("", np.nan)
+    df = df.dropna(subset=["Débits"])
+    label = df["RECAPITULATIF DES OPERATIONS"]
+    # na=False: a missing label is a plain "no match", never NaN in the mask.
+    summary = label.str.contains("TOTAUX|solde", case=False, na=False)
+    return df[~summary]
diff --git a/pdf_oralia/extract_locataire.py b/pdf_oralia/extract_locataire.py
new file mode 100644
index 0000000..8053d42
--- /dev/null
+++ b/pdf_oralia/extract_locataire.py
@@ -0,0 +1,40 @@
+import pandas as pd
+
+
+def parse_above_loc(content):
+    """Split a "Locataires" cell into lot, type and locataire fields.
+
+    A two-line cell gives the lot (2nd word of line one), the type (the
+    remaining words of line one) and the locataire (line two); any other
+    line count leaves lot and type empty and keeps the whole cell as
+    the locataire.
+    """
+    lines = content.split("\n")
+    if len(lines) != 2:
+        return pd.Series({"lot": "", "type": "", "locataire": content})
+    words = lines[0].split(" ")
+    return pd.Series(
+        {"lot": words[1], "type": " ".join(words[2:]), "locataire": lines[1]}
+    )
+
+
+def extract_situation_loc(table):
+    """Build the locataire dataframe from a pdfplumber table.
+
+    The first item of ``table`` is used as the header, the rest as rows.
+    Each "Totaux" row is kept and completed with the lot, type and
+    locataire parsed from the "Locataires" cell of the row just above it.
+    Raises ValueError when ``table`` has no header row (e.g. it is empty).
+    """
+    try:
+        df = pd.DataFrame(table[1:], columns=table[0])
+    except IndexError as err:
+        # Fail here with context rather than on an unbound ``df`` below.
+        raise ValueError(f"Cannot extract locataires from table: {table}") from err
+    rows = []
+    for i, row in df[df["Locataires"] == "Totaux"].iterrows():
+        above_row_loc = df.iloc[i - 1]["Locataires"]
+        rows.append(pd.concat([row, parse_above_loc(above_row_loc)]))
+    df_cleaned = pd.concat(rows, axis=1).T
+    df_cleaned.drop(["Locataires", "", "Période"], axis=1, inplace=True)
+    return df_cleaned