73 lines
1.7 KiB
Python
73 lines
1.7 KiB
Python
import re
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
# Position, within a table row, of the cell holding the operation text
# (used as ``row[RECAPITULATIF_DES_OPERATION]`` by keep_row and extract).
RECAPITULATIF_DES_OPERATION = 1
|
|
|
|
|
|
def is_it(page_text):
    """Tell whether a page's text identifies it as an operations summary.

    Returns True when ``page_text`` contains "RECAPITULATIF DES OPERATIONS"
    and does not contain "COMPTE RENDU DE GESTION"; False otherwise.
    """
    if "RECAPITULATIF DES OPERATIONS" not in page_text:
        return False
    return "COMPTE RENDU DE GESTION" not in page_text
|
|
|
|
|
|
def get_lot(txt):
    """Extract the lot number from a "RECAPITULATIF DES OPERATIONS" label.

    Looks for the first occurrence of one of the letters B, S or M
    immediately followed by digits, a whitespace character and a dash.
    The digits are returned as an integer zero-padded to at least two
    characters; "*" is returned when nothing matches.
    """
    found = re.search(r"[BSM](\d+)\s-", txt)
    if found is None:
        return "*"
    return format(int(found.group(1)), "02d")
|
|
|
|
|
|
def keep_row(row):
    """Return False for rows whose operation cell is a total or balance line.

    The cell at index ``RECAPITULATIF_DES_OPERATION`` is compared
    case-insensitively against a fixed list of markers; the row is kept
    (True) only when none of them occurs in that cell.
    """
    label = row[RECAPITULATIF_DES_OPERATION].lower()
    for marker in ("TOTAL", "TOTAUX", "Solde créditeur", "Solde débiteur"):
        if marker.lower() in label:
            return False
    return True
|
|
|
|
|
|
def extract(table, additionnal_fields: "dict | None" = None):
    """Turn a table into a list of dictionaries, one per kept row.

    Args:
        table: sequence of rows; ``table[0]`` is the header row and every
            following row is a sequence of cell values. Each row must not
            be longer than the header.
        additionnal_fields: optional mapping whose key/value pairs are
            copied into every produced dictionary (overriding cells that
            share the same key). Defaults to no extra fields.

    Returns:
        A list of dictionaries. Rows rejected by ``keep_row`` are skipped.
        For each kept row, cells are keyed by their header; a cell whose
        header is the empty string is stored under "Fournisseur". A "lot"
        entry is computed with ``get_lot`` from the operation cell, and
        "Fournisseur" is forced to "IMI GERANCE" when that cell contains
        "honoraire".
    """
    # A None sentinel avoids sharing one mutable default dict across calls.
    extra_fields = {} if additionnal_fields is None else additionnal_fields

    header = table[0]
    extracted = []
    for row in table[1:]:
        if not keep_row(row):
            continue

        operation = row[RECAPITULATIF_DES_OPERATION]

        record = {}
        for i, value in enumerate(row):
            column = header[i]
            # Cells under an unnamed header are stored as the supplier.
            record["Fournisseur" if column == "" else column] = value

        record.update(extra_fields)
        record["lot"] = get_lot(operation)

        # Applied last so it wins over both the cell value and extra fields.
        if "honoraire" in operation:
            record["Fournisseur"] = "IMI GERANCE"

        extracted.append(record)

    return extracted
|
|
|
|
|
|
def table2df(tables):
    """Concatenate several record tables into a single DataFrame.

    Args:
        tables: iterable of tables, each accepted by
            ``pandas.DataFrame.from_records`` and providing the columns
            "Débits", "Crédits" and "Fournisseur".

    Returns:
        The concatenation (original per-table indexes preserved) of one
        DataFrame per table in which empty strings were replaced by NaN,
        rows with neither "Débits" nor "Crédits" were dropped, and missing
        "Fournisseur" values were forward-filled from the previous row.

    Raises:
        ValueError: raised by ``pandas.concat`` when ``tables`` is empty.
    """
    frames = []
    for table in tables:
        frame = (
            pd.DataFrame.from_records(table)
            .replace("", np.nan)
            .dropna(subset=["Débits", "Crédits"], how="all")
        )
        # Series.ffill() is the supported spelling of the deprecated
        # fillna(method="ffill") and yields the same result.
        frame["Fournisseur"] = frame["Fournisseur"].ffill()
        frames.append(frame)
    return pd.concat(frames)
|