2023-06-27 08:23:02 +00:00
|
|
|
import re
|
|
|
|
|
2023-06-16 06:32:36 +00:00
|
|
|
import numpy as np
|
|
|
|
import pandas as pd
|
|
|
|
|
2023-06-28 08:30:40 +00:00
|
|
|
# Index of the "RECAPITULATIF DES OPERATIONS" cell inside a table row.
RECAPITULATIF_DES_OPERATIONS = 1

# Target dtype of every DataFrame column, handed to ``DataFrame.astype``.
DF_TYPES = {
    # Text columns.
    **dict.fromkeys(("Fournisseur", "RECAPITULATIF DES OPERATIONS"), str),
    # Numeric columns.
    **dict.fromkeys(
        ("Débits", "Crédits", "Dont T.V.A.", "Locatif", "Déductible"), float
    ),
    # Remaining text columns.
    **dict.fromkeys(("immeuble", "mois", "annee", "lot"), str),
}
|
2023-06-16 06:32:36 +00:00
|
|
|
|
|
|
|
|
|
|
|
def is_it(page_text):
    """Tell whether *page_text* is a "RECAPITULATIF DES OPERATIONS" page.

    Returns True when the text contains "RECAPITULATIF DES OPERATIONS" and
    does not contain "COMPTE RENDU DE GESTION"; False otherwise.
    """
    if "RECAPITULATIF DES OPERATIONS" not in page_text:
        return False
    return "COMPTE RENDU DE GESTION" not in page_text
|
|
|
|
|
|
|
|
|
2023-06-27 08:23:02 +00:00
|
|
|
def get_lot(txt):
    """Return the lot number found in a "RECAPITULATIF DES OPERATIONS" label.

    The lot number is the run of digits that follows a ``B``, ``S`` or ``M``
    and is itself followed by optional whitespace and a dash
    (for example ``"B3 - ..."``).

    Args:
        txt: Operation label. Non-string values (such as the NaN that stands
            in for an empty cell) are treated as carrying no lot number.

    Returns:
        The first lot number, zero-padded to at least two digits, or ``"*"``
        when no lot number is found.
    """
    if not isinstance(txt, str):
        # The regex engine raises TypeError on NaN/None; report "no lot".
        return "*"
    match = re.search(r"[BSM](\d+)(?=\s*-)", txt)
    if match is None:
        return "*"
    return "{:02d}".format(int(match.group(1)))
|
2023-06-16 06:32:36 +00:00
|
|
|
|
|
|
|
|
|
|
|
def keep_row(row):
    """Tell whether *row* should be kept.

    A row is rejected when its "RECAPITULATIF DES OPERATIONS" cell contains,
    case-insensitively, one of "TOTAL", "TOTAUX", "Solde créditeur" or
    "Solde débiteur".
    """
    for marker in ["TOTAL", "TOTAUX", "Solde créditeur", "Solde débiteur"]:
        if marker.lower() in row[RECAPITULATIF_DES_OPERATIONS].lower():
            return False
    return True
|
|
|
|
|
|
|
|
|
|
|
|
def extract(table, additionnal_fields: "dict | None" = None):
    """Turn a table into a list of row dictionaries with additional fields.

    Args:
        table: Sequence of rows. The first row is the header; every following
            row holds the cell values in header order.
        additionnal_fields: Optional mapping copied into every row
            dictionary; its entries override table columns of the same name.

    Returns:
        One dictionary per data row accepted by ``keep_row``, keyed by header
        name. Cells whose header is the empty string are stored under
        "Fournisseur". An empty table yields an empty list.
    """
    if not table:
        return []

    # A None default avoids sharing one mutable dict object between calls.
    extra_fields = additionnal_fields or {}
    header = table[0]
    extracted = []
    for row in table[1:]:
        if not keep_row(row):
            continue

        record = {}
        for i, value in enumerate(row):
            key = "Fournisseur" if header[i] == "" else header[i]
            record[key] = value
        record.update(extra_fields)

        # Rows whose label mentions "honoraire" always get this supplier.
        if "honoraire" in row[RECAPITULATIF_DES_OPERATIONS]:
            record["Fournisseur"] = "IMI GERANCE"

        extracted.append(record)
    return extracted
|
|
|
|
|
|
|
|
|
|
|
|
def table2df(tables):
    """Build one typed DataFrame from a collection of extracted tables.

    Args:
        tables: Iterable of record lists, one per table. The records must
            provide the "Débits", "Crédits", "Fournisseur", "immeuble" and
            "RECAPITULATIF DES OPERATIONS" columns.

    Returns:
        The concatenation of all tables where, per table, empty strings are
        turned into NaN, rows lacking both "Débits" and "Crédits" are
        dropped and missing "Fournisseur" values are forward-filled. The
        result gets "immeuble" replaced by its capitalized first element, a
        "lot" column computed by ``get_lot``, and is cast to ``DF_TYPES``.
    """
    frames = []
    for table in tables:
        frame = (
            pd.DataFrame.from_records(table)
            .replace("", np.nan)
            .dropna(subset=["Débits", "Crédits"], how="all")
        )
        # Series.ffill() replaces the deprecated fillna(method="ffill").
        frame["Fournisseur"] = frame["Fournisseur"].ffill()
        frames.append(frame)

    df = pd.concat(frames)

    # x[0] is presumably the first character of a string -- TODO confirm.
    df["immeuble"] = df["immeuble"].apply(lambda x: x[0].capitalize())
    df["lot"] = df["RECAPITULATIF DES OPERATIONS"].apply(get_lot)
    return df.astype(DF_TYPES, errors="ignore")
|