Feat: marche avec les pdfs tous ensembles

This commit is contained in:
2023-06-16 08:32:36 +02:00
parent 1afb2a32ab
commit 8a55e6e2cc
9 changed files with 303 additions and 216 deletions

View File

@@ -0,0 +1 @@
from . import charge, locataire, patrimoine, recapitulatif

View File

@@ -0,0 +1,72 @@
import numpy as np
import pandas as pd
# Index of the row cell that keep_row() and extract() read as the operation
# label; presumably the "RECAPITULATIF DES OPERATIONS" column -- TODO confirm.
RECAPITULATIF_DES_OPERATION = 1
def is_it(page_text):
    """Tell whether *page_text* is an operations-recap page.

    The text must contain "RECAPITULATIF DES OPERATIONS" and must not
    contain "COMPTE RENDU DE GESTION".
    """
    has_recap_title = "RECAPITULATIF DES OPERATIONS" in page_text
    has_report_title = "COMPTE RENDU DE GESTION" in page_text
    return has_recap_title and not has_report_title
def get_lot(x):
"""Return lot number from "RECAPITULATIF DES OPERATIONS" """
if x[:2].isdigit():
return x[:2]
if x[:1].isdigit():
return "0" + x[:1]
if x[:2] == "PC":
return "PC"
return ""
def keep_row(row):
    """Return False for total/balance rows and True for every other row.

    The label cell (index ``RECAPITULATIF_DES_OPERATION``) is compared
    case-insensitively against the summary markers below.
    """
    label = row[RECAPITULATIF_DES_OPERATION].lower()
    summary_markers = ("TOTAL", "TOTAUX", "Solde créditeur", "Solde débiteur")
    for marker in summary_markers:
        if marker.lower() in label:
            return False
    return True
def extract(table, additionnal_fields: "dict | None" = None):
    """Turn a raw table into a list of row dictionaries.

    The first row of *table* is used as the header. Every later row that
    ``keep_row`` accepts becomes one dictionary, filled in this order:

    1. each cell is stored under its header name; a cell whose header is
       the empty string is stored under "Fournisseur";
    2. the entries of *additionnal_fields* are copied in;
    3. "lot" is set from the label cell with ``get_lot``;
    4. "Fournisseur" is forced to "IMI GERANCE" when the label cell
       contains "honoraire".

    Args:
        table: list of rows (lists of cells); may be empty or None.
        additionnal_fields: extra key/value pairs added to every row.

    Returns:
        The list of row dictionaries (empty when *table* is empty).
    """
    # A None default avoids sharing one dict object between calls.
    extra_fields = {} if additionnal_fields is None else additionnal_fields

    extracted = []
    if not table:
        return extracted

    header = table[0]
    for row in table[1:]:
        if not keep_row(row):
            continue

        label = row[RECAPITULATIF_DES_OPERATION]
        r = {}
        for i, value in enumerate(row):
            # Cells under an untitled column are stored as "Fournisseur".
            key = "Fournisseur" if header[i] == "" else header[i]
            r[key] = value

        r.update(extra_fields)
        r["lot"] = get_lot(label)
        # NOTE(review): this match is case-sensitive, unlike keep_row().
        if "honoraire" in label:
            r["Fournisseur"] = "IMI GERANCE"
        extracted.append(r)
    return extracted
def table2df(tables):
    """Build a single DataFrame from several extracted tables.

    For each table, empty strings become NaN, rows where both "Débits"
    and "Crédits" are missing are dropped, and missing "Fournisseur"
    values are filled with the previous non-missing one.

    Args:
        tables: iterable of tables, each a list of row dictionaries that
            provide the "Débits", "Crédits" and "Fournisseur" keys.

    Returns:
        The concatenation of the per-table DataFrames.

    Raises:
        ValueError: from ``pd.concat`` when *tables* is empty.
    """
    dfs = []
    for table in tables:
        df = (
            pd.DataFrame.from_records(table)
            .replace("", np.nan)
            .dropna(subset=["Débits", "Crédits"], how="all")
        )
        # Series.ffill() replaces fillna(method="ffill"), whose ``method``
        # argument is deprecated in recent pandas releases.
        df["Fournisseur"] = df["Fournisseur"].ffill()
        dfs.append(df)
    return pd.concat(dfs)

View File

@@ -0,0 +1,134 @@
import pandas as pd
def is_it(page_text):
    """Tell whether *page_text* contains "SITUATION DES LOCATAIRES"."""
    return "SITUATION DES LOCATAIRES" in page_text
def is_drop(row):
    """Tell whether *row* must be discarded.

    A row is discarded when its first cell contains "totaux"
    (case-insensitive) or when none of its cells is truthy.
    """
    first_cell = row[0].lower()
    if "totaux" in first_cell:
        return True
    return not any(row)
def extract(table, additionnal_fields: "dict | None" = None):
    """Turn a raw table into a list of row dictionaries.

    The first row of *table* is used as the header. Every later row that
    ``is_drop`` does not reject becomes one dictionary mapping header
    names to cells; cells whose header is the empty string are skipped.
    The entries of *additionnal_fields* are then copied into each
    dictionary.

    Args:
        table: list of rows (lists of cells); may be empty or None.
        additionnal_fields: extra key/value pairs added to every row.

    Returns:
        The list of row dictionaries (empty when *table* is empty).
    """
    # A None default avoids sharing one dict object between calls.
    extra_fields = {} if additionnal_fields is None else additionnal_fields

    extracted = []
    if not table:
        return extracted

    header = table[0]
    for row in table[1:]:
        if is_drop(row):
            continue
        r = {
            header[i]: value
            for i, value in enumerate(row)
            if header[i] != ""
        }
        r.update(extra_fields)
        extracted.append(r)
    return extracted
def join_row(last, next):
    """Merge two row dictionaries key by key, following the keys of *last*.

    For each key: equal values are kept once; two different truthy values
    are joined with a newline; otherwise the truthy one is kept, or "" when
    neither is truthy.

    NOTE(review): this module later defines another ``join_row(table)``
    with the same name, which replaces this definition once the module is
    loaded.
    """
    merged = {}
    for key, left in last.items():
        right = next[key]
        if left == right:
            merged[key] = left
        elif left and right:
            merged[key] = f"{left}\n{right}"
        else:
            merged[key] = left or right or ""
    return merged
def _merge_rows(last, first):
    """Merge two row dictionaries cell by cell, following the keys of *last*."""
    merged = {}
    for key, left in last.items():
        right = first.get(key)
        if left == right:
            merged[key] = left
        elif left and right:
            # Both halves carry text: keep both, one per line.
            merged[key] = f"{left}\n{right}"
        else:
            merged[key] = left or right or ""
    return merged


def join_tables(tables):
    """Concatenate tables, stitching rows that were split between tables.

    When the last row collected so far is not a "totaux" row (checked on
    its "Locataires" cell, case-insensitively), it is merged with the first
    row of the next table; otherwise the next table is simply appended.

    The merge is done by a private helper because the module-level name
    ``join_row`` is rebound later in this file to a one-argument function,
    so calling it here with two rows would fail.

    Args:
        tables: list of tables, each a list of row dictionaries having a
            "Locataires" key.

    Returns:
        A new list of rows; the input lists are left untouched. An empty
        *tables* gives an empty list.
    """
    if not tables:
        return []

    # Copy so that extending the result never mutates the caller's table.
    joined = list(tables[0])
    for t in tables[1:]:
        if not t:
            continue
        if not joined or "totaux" in joined[-1]["Locataires"].lower():
            joined.extend(t)
        else:
            joined[-1] = _merge_rows(joined[-1], t[0])
            joined.extend(t[1:])
    return joined
def parse_lot(string):
    """Split a lot label on single spaces into its "Lot" and "Type" parts.

    The second space-separated word becomes "Lot" and everything after it
    becomes "Type" ("" when there is nothing left). The first word is
    ignored.

    Raises:
        IndexError: when *string* contains no space.
    """
    parts = string.split(" ", 2)
    lot_number = parts[1]
    lot_type = parts[2] if len(parts) == 3 else ""
    return {"Lot": lot_number, "Type": lot_type}
def join_row(table):
    """Consolidate the flat list of tenant rows.

    Rows are handled one by one, depending on their "Locataires" cell:

    - starts with "Lot": the label is parsed with ``parse_lot`` into "Lot"
      and "Type", "Locataires" is blanked and the row is kept;
    - equals "Rappel de Loyer": the row inherits "Lot", "Type" and
      "Locataires" from the previous kept row, gets
      "Divers" = "Rappel de Loyer" and is kept;
    - any other non-empty value: the previous kept row is removed, its
      truthy cells overwrite the current row's, both names are joined with
      a space and the current row is kept in its place;
    - empty: a "Période" starting with "Solde" replaces the previous kept
      row (inheriting its identification cells); a "Période" starting with
      "Du" is kept after it (same inheritance); anything else is printed
      and not kept.

    The row dictionaries of *table* are updated in place.
    """

    def inherited_cells(source):
        # Identification cells copied from an already collected row.
        return {
            "Lot": source["Lot"],
            "Type": source["Type"],
            "Locataires": source["Locataires"],
        }

    joined = []
    for row in table:
        tenant = row["Locataires"]

        if tenant.startswith("Lot"):
            row.update(parse_lot(tenant))
            row["Locataires"] = ""
            joined.append(row)
            continue

        if tenant == "Rappel de Loyer":
            cells = inherited_cells(joined[-1])
            cells["Divers"] = "Rappel de Loyer"
            row.update(cells)
            joined.append(row)
            continue

        if tenant:
            previous = joined.pop()
            name_part = tenant.replace("\n", " ")
            row.update({k: v for k, v in previous.items() if v})
            row["Locataires"] = previous["Locataires"] + " " + name_part
            joined.append(row)
            continue

        period = row["Période"]
        if period.startswith("Solde"):
            previous = joined.pop()
            row.update(inherited_cells(previous))
            joined.append(row)
        elif period.startswith("Du"):
            row.update(inherited_cells(joined[-1]))
            joined.append(row)
        else:
            print(row)
    return joined
def flat_tables(tables):
    """Return one flat list holding the rows of every table, in order."""
    return [row for table in tables for row in table]
def table2df(tables):
    """Build the tenants DataFrame from a list of tables.

    The tables are flattened into one list of rows (``flat_tables``), the
    rows are consolidated (``join_row``) and the result is loaded into a
    DataFrame.
    """
    all_rows = flat_tables(tables)
    consolidated_rows = join_row(all_rows)
    return pd.DataFrame.from_records(consolidated_rows)

View File

@@ -0,0 +1,4 @@
def is_it(page_text):
    """Tell whether *page_text* contains "VOTRE PATRIMOINE"."""
    return "VOTRE PATRIMOINE" in page_text

View File

@@ -0,0 +1,34 @@
import numpy as np
import pandas as pd
def is_it(page_text):
    """Tell whether *page_text* contains "COMPTE RENDU DE GESTION"."""
    return "COMPTE RENDU DE GESTION" in page_text
def extract(table, additionnal_fields: "dict | None" = None):
    """Extract the "Remise commerciale gérance" rows of a table.

    The first row of *table* is used as the header. Every later row that
    has a cell exactly equal to "Remise commerciale gérance" becomes one
    dictionary mapping header names to cells, completed with the entries
    of *additionnal_fields*.

    Args:
        table: list of rows (lists of cells); may be empty or None.
        additionnal_fields: extra key/value pairs added to every row.

    Returns:
        The list of matching row dictionaries (empty when *table* is
        empty or has no matching row).
    """
    # A None default avoids sharing one dict object between calls.
    extra_fields = {} if additionnal_fields is None else additionnal_fields

    extracted = []
    if not table:
        return extracted

    header = table[0]
    for row in table[1:]:
        # List membership: one cell must equal the label exactly.
        if "Remise commerciale gérance" not in row:
            continue
        r = {header[i]: value for i, value in enumerate(row)}
        r.update(extra_fields)
        extracted.append(r)
    return extracted
# df = pd.DataFrame(table[1:], columns=table[0]).replace("", np.nan)
# df = df[
# df["RECAPITULATIF DES OPERATIONS"].str.contains(
# "Remise commerciale gérance", case=False, na=False
# )
# ]
#
# df.columns.values[0] = "Fournisseur"
# return df