From 223f25130d6324a6074f11d6c0fc0f9f57e649b8 Mon Sep 17 00:00:00 2001
From: Bertrand Benjamin <benjamin.bertrand@opytex.org>
Date: Wed, 28 Jun 2023 10:30:40 +0200
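Subject: [PATCH] Feat: cast DataFrame columns to explicit dtypes

Add a DF_TYPES column-to-dtype mapping to the charge and locataire page
modules and apply it with DataFrame.astype(errors="ignore") in each
table2df. Also rename RECAPITULATIF_DES_OPERATION to
RECAPITULATIF_DES_OPERATIONS, fix a docstring typo in extract, and set
the pyproject.toml version to "1.dev".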
---
 pdf_oralia/pages/charge.py    | 26 ++++++++++++++++++++------
 pdf_oralia/pages/locataire.py | 19 ++++++++++++++++++-
 pyproject.toml                |  2 +-
 3 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/pdf_oralia/pages/charge.py b/pdf_oralia/pages/charge.py
index d10c4b6..9048422 100644
--- a/pdf_oralia/pages/charge.py
+++ b/pdf_oralia/pages/charge.py
@@ -3,7 +3,20 @@ import re
 import numpy as np
 import pandas as pd
 
-RECAPITULATIF_DES_OPERATION = 1
+RECAPITULATIF_DES_OPERATIONS = 1
+DF_TYPES = {
+    "Fournisseur": str,
+    "RECAPITULATIF DES OPERATIONS": str,
+    "Débits": float,
+    "Crédits": float,
+    "Dont T.V.A.": float,
+    "Locatif": float,
+    "Déductible": float,
+    "immeuble": str,
+    "mois": str,
+    "annee": str,
+    "lot": str,
+}
 
 
 def is_it(page_text):
@@ -27,14 +40,14 @@ def get_lot(txt):
 def keep_row(row):
     return not any(
         [
-            word.lower() in row[RECAPITULATIF_DES_OPERATION].lower()
+            word.lower() in row[RECAPITULATIF_DES_OPERATIONS].lower()
             for word in ["TOTAL", "TOTAUX", "Solde créditeur", "Solde débiteur"]
         ]
     )
 
 
 def extract(table, additionnal_fields: dict = {}):
-    """Turn table to dictionary with additionnal fields"""
+    """Turn table to dictionary with additional fields"""
     extracted = []
     header = table[0]
     for row in table[1:]:
@@ -49,9 +62,9 @@ def extract(table, additionnal_fields: dict = {}):
             for k, v in additionnal_fields.items():
                 r[k] = v
 
-            r["lot"] = get_lot(row[RECAPITULATIF_DES_OPERATION])
+            r["lot"] = get_lot(row[RECAPITULATIF_DES_OPERATIONS])
 
-            if "honoraire" in row[RECAPITULATIF_DES_OPERATION]:
+            if "honoraire" in row[RECAPITULATIF_DES_OPERATIONS]:
                 r["Fournisseur"] = "IMI GERANCE"
 
             extracted.append(r)
@@ -69,4 +82,5 @@ def table2df(tables):
         )
         df["Fournisseur"] = df["Fournisseur"].fillna(method="ffill")
         dfs.append(df)
-    return pd.concat(dfs)
+    df = pd.concat(dfs).astype(DF_TYPES, errors="ignore")
+    return df
diff --git a/pdf_oralia/pages/locataire.py b/pdf_oralia/pages/locataire.py
index baba702..4ec4141 100644
--- a/pdf_oralia/pages/locataire.py
+++ b/pdf_oralia/pages/locataire.py
@@ -1,5 +1,22 @@
 import pandas as pd
 
+DF_TYPES = {
+    "Locataires": str,
+    "Période": str,
+    "Loyers": float,
+    "Taxes": float,
+    "Provisions": float,
+    "Divers": str,
+    "Total": float,
+    "Réglés": float,
+    "Impayés": float,
+    "immeuble": str,
+    "mois": str,
+    "annee": str,
+    "Lot": str,
+    "Type": str,
+}
+
 
 def is_it(page_text):
     if "SITUATION DES LOCATAIRES" in page_text:
@@ -131,4 +148,4 @@ def flat_tables(tables):
 def table2df(tables):
     tables = flat_tables(tables)
     joined = join_row(tables)
-    return pd.DataFrame.from_records(joined)
+    return pd.DataFrame.from_records(joined).astype(DF_TYPES, errors="ignore")
diff --git a/pyproject.toml b/pyproject.toml
index 7f6acac..5a60b72 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pdf-oralia"
-version = "VERSION_PLACEHOLDER"
+version = "1.dev"
 description = ""
 authors = ["Bertrand Benjamin <benjamin.bertrand@opytex.org>"]
 readme = "README.md"