From 223f25130d6324a6074f11d6c0fc0f9f57e649b8 Mon Sep 17 00:00:00 2001
From: Bertrand Benjamin
Date: Wed, 28 Jun 2023 10:30:40 +0200
Subject: [PATCH] Feat: type df columns

---
 pdf_oralia/pages/charge.py    | 26 ++++++++++++++++++++------
 pdf_oralia/pages/locataire.py | 19 ++++++++++++++++++-
 pyproject.toml                |  2 +-
 3 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/pdf_oralia/pages/charge.py b/pdf_oralia/pages/charge.py
index d10c4b6..9048422 100644
--- a/pdf_oralia/pages/charge.py
+++ b/pdf_oralia/pages/charge.py
@@ -3,7 +3,20 @@ import re
 import numpy as np
 import pandas as pd
 
-RECAPITULATIF_DES_OPERATION = 1
+RECAPITULATIF_DES_OPERATIONS = 1
+DF_TYPES = {
+    "Fournisseur": str,
+    "RECAPITULATIF DES OPERATIONS": str,
+    "Débits": float,
+    "Crédits": float,
+    "Dont T.V.A.": float,
+    "Locatif": float,
+    "Déductible": float,
+    "immeuble": str,
+    "mois": str,
+    "annee": str,
+    "lot": str,
+}
 
 
 def is_it(page_text):
@@ -27,14 +40,14 @@ def get_lot(txt):
 def keep_row(row):
     return not any(
         [
-            word.lower() in row[RECAPITULATIF_DES_OPERATION].lower()
+            word.lower() in row[RECAPITULATIF_DES_OPERATIONS].lower()
             for word in ["TOTAL", "TOTAUX", "Solde créditeur", "Solde débiteur"]
         ]
     )
 
 
 def extract(table, additionnal_fields: dict = {}):
-    """Turn table to dictionary with additionnal fields"""
+    """Turn table to dictionary with additional fields"""
     extracted = []
     header = table[0]
     for row in table[1:]:
@@ -49,9 +62,9 @@ def extract(table, additionnal_fields: dict = {}):
             for k, v in additionnal_fields.items():
                 r[k] = v
 
-            r["lot"] = get_lot(row[RECAPITULATIF_DES_OPERATION])
+            r["lot"] = get_lot(row[RECAPITULATIF_DES_OPERATIONS])
 
-            if "honoraire" in row[RECAPITULATIF_DES_OPERATION]:
+            if "honoraire" in row[RECAPITULATIF_DES_OPERATIONS]:
                 r["Fournisseur"] = "IMI GERANCE"
 
             extracted.append(r)
@@ -69,4 +82,5 @@ def table2df(tables):
         )
         df["Fournisseur"] = df["Fournisseur"].fillna(method="ffill")
         dfs.append(df)
-    return pd.concat(dfs)
+    df = pd.concat(dfs).astype(DF_TYPES, errors="ignore")
+    return df
diff --git a/pdf_oralia/pages/locataire.py b/pdf_oralia/pages/locataire.py
index baba702..4ec4141 100644
--- a/pdf_oralia/pages/locataire.py
+++ b/pdf_oralia/pages/locataire.py
@@ -1,5 +1,22 @@
 import pandas as pd
 
+DF_TYPES = {
+    "Locataires": str,
+    "Période": str,
+    "Loyers": float,
+    "Taxes": float,
+    "Provisions": float,
+    "Divers": str,
+    "Total": float,
+    "Réglés": float,
+    "Impayés": float,
+    "immeuble": str,
+    "mois": str,
+    "annee": str,
+    "Lot": str,
+    "Type": str,
+}
+
 
 def is_it(page_text):
     if "SITUATION DES LOCATAIRES" in page_text:
@@ -131,4 +148,4 @@ def flat_tables(tables):
 def table2df(tables):
     tables = flat_tables(tables)
     joined = join_row(tables)
-    return pd.DataFrame.from_records(joined)
+    return pd.DataFrame.from_records(joined).astype(DF_TYPES, errors="ignore")
diff --git a/pyproject.toml b/pyproject.toml
index 7f6acac..5a60b72 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pdf-oralia"
-version = "VERSION_PLACEHOLDER"
+version = "1.dev"
 description = ""
 authors = ["Bertrand Benjamin "]
 readme = "README.md"