pdf_auralia/pdf_oralia/pages/charge.py

73 lines
1.7 KiB
Python

import re
import numpy as np
import pandas as pd
# Index, within a table row, of the cell that keep_row() and extract() read as
# the operation label (rows are 0-indexed, so this is the second cell).
# NOTE(review): presumably the "RECAPITULATIF DES OPERATIONS" column of the
# extracted PDF table -- confirm against the caller that builds the table.
RECAPITULATIF_DES_OPERATION = 1
def is_it(page_text):
    """Tell whether ``page_text`` belongs to an operations summary page.

    Returns True when the text contains "RECAPITULATIF DES OPERATIONS" and
    does not contain "COMPTE RENDU DE GESTION"; False otherwise.
    """
    has_recap = "RECAPITULATIF DES OPERATIONS" in page_text
    has_report = "COMPTE RENDU DE GESTION" in page_text
    return has_recap and not has_report
def get_lot(txt):
    """Extract the lot number from an operation label.

    Searches ``txt`` for the first occurrence of one of the letters ``B``,
    ``S`` or ``M`` immediately followed by digits, exactly one whitespace
    character and a dash.  The digits are returned as a string zero-padded
    to at least two characters; ``"*"`` is returned when nothing matches.
    """
    found = re.search(r"[BSM](\d+)\s-", txt)
    if found is None:
        return "*"
    return f"{int(found.group(1)):02d}"
def keep_row(row):
    """Tell whether a table row should be kept.

    A row is rejected when its operation label (the cell at index
    ``RECAPITULATIF_DES_OPERATION``) contains, case-insensitively, one of
    the words "TOTAL", "TOTAUX", "Solde créditeur" or "Solde débiteur".
    """
    label = row[RECAPITULATIF_DES_OPERATION].lower()
    for word in ("TOTAL", "TOTAUX", "Solde créditeur", "Solde débiteur"):
        if word.lower() in label:
            return False
    return True
def extract(table, additionnal_fields: "dict | None" = None):
    """Turn a table into a list of row dictionaries with additional fields.

    The first row of ``table`` is used as the header; every following row
    accepted by ``keep_row`` becomes one dictionary mapping header names to
    cell values.  A cell whose header is the empty string is stored under
    the key "Fournisseur".

    Args:
        table: sequence of rows (each a sequence of cells), header first.
            An empty or ``None`` table yields an empty list.
        additionnal_fields: optional mapping copied into every produced
            dictionary; its entries override values read from the row.

    Returns:
        A list with one dictionary per kept row.  Each dictionary also gets
        a "lot" entry computed by ``get_lot`` from the operation label, and
        "Fournisseur" is forced to "IMI GERANCE" when the label contains
        "honoraire".
    """
    # Nothing to extract without at least a header row.
    if not table:
        return []

    # A None sentinel avoids sharing one mutable default dict across calls.
    extra_fields = {} if additionnal_fields is None else additionnal_fields

    header = table[0]
    extracted = []
    for row in table[1:]:
        if not keep_row(row):
            continue

        record = {}
        for i, value in enumerate(row):
            # A column without a header name holds the supplier.
            key = "Fournisseur" if header[i] == "" else header[i]
            record[key] = value

        # Extra fields are applied after the row so they take precedence.
        record.update(extra_fields.items())

        label = row[RECAPITULATIF_DES_OPERATION]
        record["lot"] = get_lot(label)
        if "honoraire" in label:
            record["Fournisseur"] = "IMI GERANCE"

        extracted.append(record)
    return extracted
def table2df(tables):
    """Build one DataFrame from several tables of row records.

    Each table is loaded with ``pandas.DataFrame.from_records``.  Empty
    strings are turned into missing values, rows where both "Débits" and
    "Crédits" are missing are dropped, and missing "Fournisseur" values are
    forward-filled from the preceding remaining row of the same table.

    Args:
        tables: iterable of tables, each accepted by
            ``DataFrame.from_records`` and providing the "Débits",
            "Crédits" and "Fournisseur" columns.

    Returns:
        The concatenation of the per-table DataFrames (row indexes are left
        as produced for each table, not renumbered).

    Raises:
        KeyError: if a table lacks one of the required columns.
        ValueError: if ``tables`` is empty (raised by ``pandas.concat``).
    """
    frames = []
    for table in tables:
        frame = (
            pd.DataFrame.from_records(table)
            .replace("", np.nan)
            .dropna(subset=["Débits", "Crédits"], how="all")
        )
        # ``fillna(method="ffill")`` is deprecated in recent pandas;
        # ``ffill()`` is the equivalent, supported spelling.
        frame["Fournisseur"] = frame["Fournisseur"].ffill()
        frames.append(frame)
    return pd.concat(frames)