pdf_auralia/pdf_oralia/pages/charge.py

87 lines
2.1 KiB
Python
Raw Normal View History

2023-06-27 08:23:02 +00:00
import re
import numpy as np
import pandas as pd
2023-06-28 08:30:40 +00:00
# Index of the row cell that keep_row() and extract() read as the operation
# text (presumably the "RECAPITULATIF DES OPERATIONS" column -- confirm
# against the table layout).
RECAPITULATIF_DES_OPERATIONS = 1

# Column name -> dtype mapping, grouped by dtype while keeping the column
# order: text columns, amount columns, then the text metadata columns.
DF_TYPES = {
    **dict.fromkeys(("Fournisseur", "RECAPITULATIF DES OPERATIONS"), str),
    **dict.fromkeys(
        ("Débits", "Crédits", "Dont T.V.A.", "Locatif", "Déductible"),
        float,
    ),
    **dict.fromkeys(("immeuble", "mois", "annee", "lot"), str),
}
def is_it(page_text):
    """Tell whether *page_text* is a page this module should handle.

    A page qualifies when its text contains "RECAPITULATIF DES OPERATIONS"
    and does not contain "COMPTE RENDU DE GESTION".
    """
    if "RECAPITULATIF DES OPERATIONS" not in page_text:
        return False
    return "COMPTE RENDU DE GESTION" not in page_text
2023-06-27 08:23:02 +00:00
def get_lot(txt):
    """Return the lot number found in a "RECAPITULATIF DES OPERATIONS" text.

    The number is the run of digits that follows a ``B``, ``S`` or ``M`` and
    is itself followed by one whitespace character and a dash. The first such
    occurrence is used, rendered as an integer zero-padded to two digits.
    ``"*"`` is returned when *txt* contains no such pattern.
    """
    match = re.search(r"[BSM](\d+)\s-", txt)
    if match is None:
        return "*"
    return f"{int(match.group(1)):02d}"
def keep_row(row):
    """Tell whether *row* should be kept.

    A row is rejected when its operation cell contains, ignoring case, one of
    "TOTAL", "TOTAUX", "Solde créditeur" or "Solde débiteur".
    """
    label = row[RECAPITULATIF_DES_OPERATIONS].lower()
    for word in ("TOTAL", "TOTAUX", "Solde créditeur", "Solde débiteur"):
        if word.lower() in label:
            return False
    return True
def extract(table, additionnal_fields: "dict | None" = None):
    """Turn a table into a list of row dictionaries with additional fields.

    The first row of *table* is used as the header. Every following row that
    ``keep_row`` accepts becomes one dictionary mapping each header name to
    the cell at the same position.

    Args:
        table: sequence of rows, the first one being the header row.
        additionnal_fields: extra key/value pairs copied into every record;
            they overwrite same-named columns. ``None`` (the default) adds
            nothing.

    Returns:
        A list with one dictionary per kept row; empty when *table* is empty.
    """
    # ``None`` rather than a ``{}`` default: a mutable default is created once
    # and shared by every call.
    extra_fields = {} if additionnal_fields is None else additionnal_fields
    if not table:
        return []

    header = table[0]
    extracted = []
    for row in table[1:]:
        if not keep_row(row):
            continue

        record = {}
        for i, value in enumerate(row):
            # A column with an empty header is stored under "Fournisseur".
            column = "Fournisseur" if header[i] == "" else header[i]
            record[column] = value
        record.update(extra_fields)

        label = row[RECAPITULATIF_DES_OPERATIONS]
        record["lot"] = get_lot(label)
        # Rows mentioning "honoraire" are attributed to a fixed supplier.
        if "honoraire" in label:
            record["Fournisseur"] = "IMI GERANCE"
        extracted.append(record)
    return extracted
def table2df(tables):
    """Build a single DataFrame from several tables of extracted records.

    For each table: empty strings are replaced by NaN, rows where both
    "Débits" and "Crédits" are missing are dropped, and missing "Fournisseur"
    values are forward-filled from the previous remaining row. The resulting
    frames are concatenated and cast with ``DF_TYPES`` (``errors="ignore"``).

    Args:
        tables: iterable of tables, each one a list of record dictionaries.

    Returns:
        The concatenated, type-cast DataFrame.
    """
    frames = []
    for table in tables:
        frame = (
            pd.DataFrame.from_records(table)
            .replace("", np.nan)
            .dropna(subset=["Débits", "Crédits"], how="all")
        )
        # ``Series.ffill()`` replaces ``fillna(method="ffill")``, which recent
        # pandas releases deprecate.
        frame["Fournisseur"] = frame["Fournisseur"].ffill()
        frames.append(frame)
    return pd.concat(frames).astype(DF_TYPES, errors="ignore")