Feat: marche avec les pdfs tous ensembles
This commit is contained in:
1
pdf_oralia/pages/__init__.py
Normal file
1
pdf_oralia/pages/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from . import charge, locataire, patrimoine, recapitulatif
|
||||
72
pdf_oralia/pages/charge.py
Normal file
72
pdf_oralia/pages/charge.py
Normal file
@@ -0,0 +1,72 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
# Position, within a table row, of the cell holding the operation label.
# keep_row() and extract() read row[RECAPITULATIF_DES_OPERATION].
RECAPITULATIF_DES_OPERATION = 1
|
||||
|
||||
|
||||
def is_it(page_text):
    """Tell whether ``page_text`` is a charge page.

    The page must contain "RECAPITULATIF DES OPERATIONS" and must not
    contain "COMPTE RENDU DE GESTION".
    """
    has_recap = "RECAPITULATIF DES OPERATIONS" in page_text
    has_report = "COMPTE RENDU DE GESTION" in page_text
    return has_recap and not has_report
|
||||
|
||||
|
||||
def get_lot(x):
    """Return the lot code found at the start of an operation label.

    Args:
        x: text of the "RECAPITULATIF DES OPERATIONS" cell.

    Returns:
        A two-character string: the leading digits zero-padded to two
        characters ("12", "03"), "PC" when the text starts with "PC",
        or "" when no lot code is recognised.
    """
    prefix = x[:2]
    if prefix.isdigit():
        # zfill covers the one-character input ("3"), so the result is
        # always two characters wide like the other numeric branch.
        return prefix.zfill(2)
    if prefix[:1].isdigit():
        return "0" + prefix[:1]
    if prefix == "PC":
        return "PC"
    return ""
|
||||
|
||||
|
||||
def keep_row(row):
    """Return False for total/balance rows, True for every other row.

    The operation label cell is compared case-insensitively against the
    markers below; a row is rejected as soon as one marker is found.
    """
    label = row[RECAPITULATIF_DES_OPERATION].lower()
    for marker in ("TOTAL", "TOTAUX", "Solde créditeur", "Solde débiteur"):
        if marker.lower() in label:
            return False
    return True
|
||||
|
||||
|
||||
def extract(table, additionnal_fields: "dict | None" = None):
    """Turn a charge table into a list of row dictionaries.

    Args:
        table: list of rows (lists of cell values); the first row is the
            header.
        additionnal_fields: extra key/value pairs copied into every
            produced dictionary. Defaults to no extra field.

    Returns:
        One dictionary per row accepted by ``keep_row``. Cells are keyed by
        their header; the cell under an empty header is stored as
        "Fournisseur". A "lot" key is added from ``get_lot``.
    """
    if not table:
        return []

    # None instead of a shared ``{}`` default avoids the mutable-default trap.
    extra = additionnal_fields or {}
    header = table[0]
    extracted = []

    for row in table[1:]:
        if not keep_row(row):
            continue

        label = row[RECAPITULATIF_DES_OPERATION]
        r = {}
        for i, value in enumerate(row):
            # The supplier column has no title in the source table.
            key = header[i] if header[i] != "" else "Fournisseur"
            r[key] = value

        r.update(extra)
        r["lot"] = get_lot(label)

        # Fee lines are attributed to the managing agency.
        if "honoraire" in label:
            r["Fournisseur"] = "IMI GERANCE"

        extracted.append(r)

    return extracted
|
||||
|
||||
|
||||
def table2df(tables):
    """Build a single DataFrame from extracted charge tables.

    Args:
        tables: iterable of tables, each a list of row dictionaries holding
            at least the "Débits", "Crédits" and "Fournisseur" keys.

    Returns:
        The concatenation of all tables, where empty strings are turned
        into NaN, rows with neither "Débits" nor "Crédits" are dropped and
        missing "Fournisseur" values are forward-filled. An empty DataFrame
        is returned when ``tables`` yields nothing.
    """
    dfs = []
    for table in tables:
        df = (
            pd.DataFrame.from_records(table)
            .replace("", np.nan)
            .dropna(subset=["Débits", "Crédits"], how="all")
        )
        # ``fillna(method="ffill")`` is deprecated; ``ffill()`` is the
        # supported spelling of the same operation.
        df["Fournisseur"] = df["Fournisseur"].ffill()
        dfs.append(df)

    if not dfs:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame()
    return pd.concat(dfs)
|
||||
134
pdf_oralia/pages/locataire.py
Normal file
134
pdf_oralia/pages/locataire.py
Normal file
@@ -0,0 +1,134 @@
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def is_it(page_text):
    """Tell whether ``page_text`` contains "SITUATION DES LOCATAIRES"."""
    return "SITUATION DES LOCATAIRES" in page_text
|
||||
|
||||
|
||||
def is_drop(row):
    """Tell whether a tenant-table row must be discarded.

    Args:
        row: list of cell values.

    Returns:
        True when the row has no truthy cell (including an empty list) or
        when its first cell is text containing "totaux" (case-insensitive);
        False otherwise.
    """
    # Emptiness is checked first so that ``row[0]`` is never read on an
    # empty row.
    if not any(row):
        return True
    first = row[0]
    # A non-text first cell (e.g. None) cannot be a totals label.
    return isinstance(first, str) and "totaux" in first.lower()
|
||||
|
||||
|
||||
def extract(table, additionnal_fields: "dict | None" = None):
    """Turn a tenant table into a list of row dictionaries.

    Args:
        table: list of rows (lists of cell values); the first row is the
            header.
        additionnal_fields: extra key/value pairs copied into every
            produced dictionary. Defaults to no extra field.

    Returns:
        One dictionary per row not rejected by ``is_drop``, keyed by header.
        Cells under an empty header are ignored.
    """
    if not table:
        return []

    # None instead of a shared ``{}`` default avoids the mutable-default trap.
    extra = additionnal_fields or {}
    header = table[0]
    extracted = []

    for row in table[1:]:
        if is_drop(row):
            continue
        r = {
            header[i]: value
            for i, value in enumerate(row)
            if header[i] != ""
        }
        r.update(extra)
        extracted.append(r)

    return extracted
|
||||
|
||||
|
||||
def join_row(last, next):
    """Merge two row dictionaries key by key.

    For every key of ``last``: equal values are kept as is, two different
    non-empty values are joined with a newline, a single non-empty value is
    kept, and two different empty values give "".

    NOTE(review): this module defines another ``join_row(table)`` further
    down; that later definition replaces this one once the module is loaded.
    """
    merged = {}
    for key, before in last.items():
        after = next[key]
        if before == after:
            merged[key] = before
            continue
        if before and after:
            merged[key] = f"{before}\n{after}"
        else:
            merged[key] = before or after or ""
    return merged
|
||||
|
||||
|
||||
def _merge_split_rows(last, following):
    """Merge a row cut by a page break with its continuation, key by key."""
    merged = {}
    for key, before in last.items():
        after = following.get(key, "")
        if before == after:
            merged[key] = before
        elif before and after:
            merged[key] = f"{before}\n{after}"
        else:
            merged[key] = before or after or ""
    return merged


def join_tables(tables):
    """Concatenate tenant tables, stitching rows split across two tables.

    Args:
        tables: iterable of tables, each a list of row dictionaries with a
            "Locataires" key.

    Returns:
        A new list of rows. When the last row collected so far is not a
        "totaux" row, it is merged with the first row of the next table;
        otherwise the next table is appended unchanged. Empty tables are
        skipped and the input lists are never modified.
    """
    # Merging uses the private helper above: the module-level name
    # ``join_row`` ends up bound to a one-argument function, so it cannot be
    # called here with two rows.
    joined = []
    for t in tables:
        if not t:
            continue
        if joined and "totaux" not in joined[-1]["Locataires"].lower():
            joined[-1] = _merge_split_rows(joined[-1], t[0])
            joined.extend(t[1:])
        else:
            joined.extend(t)
    return joined
|
||||
|
||||
|
||||
def parse_lot(string):
    """Split a lot description on spaces into its number and its type.

    The second space-separated word becomes "Lot"; everything after it
    becomes "Type" ("" when nothing follows).
    """
    pieces = string.split(" ", 2)
    lot_number = pieces[1]
    lot_type = pieces[2] if len(pieces) > 2 else ""
    return {"Lot": lot_number, "Type": lot_type}
|
||||
|
||||
|
||||
def join_row(table):
    """Rebuild complete tenant rows from a flat list of row dictionaries.

    Each row is handled according to its "Locataires" cell, or its
    "Période" cell when "Locataires" is empty:

    * starts with "Lot": the text is parsed by ``parse_lot`` into "Lot" and
      "Type", "Locataires" is blanked and the row is kept;
    * equals "Rappel de Loyer": "Lot", "Type" and "Locataires" are copied
      from the last kept row and "Divers" is set to "Rappel de Loyer";
    * any other non-empty value: the last kept row is removed, its non-empty
      cells overwrite this row's cells, and this row's name is appended to
      the removed row's "Locataires";
    * empty, "Période" starts with "Solde": the row replaces the last kept
      row and inherits its "Lot", "Type" and "Locataires";
    * empty, "Période" starts with "Du": the row is added after the last
      kept row and inherits the same three cells;
    * anything else: the row is printed and left out of the result.

    The dictionaries of ``table`` are updated in place.
    """
    carried = ("Lot", "Type", "Locataires")
    rebuilt = []

    for row in table:
        tenant = row["Locataires"]

        if tenant.startswith("Lot"):
            row.update(parse_lot(tenant))
            row["Locataires"] = ""
        elif tenant == "Rappel de Loyer":
            previous = rebuilt[-1]
            row.update({key: previous[key] for key in carried})
            row["Divers"] = "Rappel de Loyer"
        elif tenant:
            previous = rebuilt.pop()
            name = tenant.replace("\n", " ")
            row.update({k: v for k, v in previous.items() if v})
            row["Locataires"] = previous["Locataires"] + " " + name
        else:
            period = row["Période"]
            if period.startswith("Solde"):
                previous = rebuilt.pop()
            elif period.startswith("Du"):
                previous = rebuilt[-1]
            else:
                print(row)
                continue
            row.update({key: previous[key] for key in carried})

        rebuilt.append(row)

    return rebuilt
|
||||
|
||||
|
||||
def flat_tables(tables):
    """Return the rows of every table, in order, as one flat list."""
    return [row for table in tables for row in table]
|
||||
|
||||
|
||||
def table2df(tables):
    """Flatten the tables, rebuild their rows and return a DataFrame."""
    rows = join_row(flat_tables(tables))
    return pd.DataFrame.from_records(rows)
|
||||
4
pdf_oralia/pages/patrimoine.py
Normal file
4
pdf_oralia/pages/patrimoine.py
Normal file
@@ -0,0 +1,4 @@
|
||||
def is_it(page_text):
    """Tell whether ``page_text`` contains "VOTRE PATRIMOINE"."""
    return "VOTRE PATRIMOINE" in page_text
|
||||
34
pdf_oralia/pages/recapitulatif.py
Normal file
34
pdf_oralia/pages/recapitulatif.py
Normal file
@@ -0,0 +1,34 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def is_it(page_text):
    """Tell whether ``page_text`` contains "COMPTE RENDU DE GESTION"."""
    return "COMPTE RENDU DE GESTION" in page_text
|
||||
|
||||
|
||||
def extract(table, additionnal_fields: "dict | None" = None):
    """Extract the "Remise commerciale gérance" rows of a table.

    Args:
        table: list of rows (lists of cell values); the first row is the
            header.
        additionnal_fields: extra key/value pairs copied into every
            produced dictionary. Defaults to no extra field.

    Returns:
        One dictionary, keyed by header, for each row that has a cell equal
        to "Remise commerciale gérance". An empty list when there is no such
        row or when ``table`` is empty.
    """
    if not table:
        return []

    # None instead of a shared ``{}`` default avoids the mutable-default trap.
    extra = additionnal_fields or {}
    header = table[0]
    extracted = []

    for row in table[1:]:
        # Membership on the row list: a whole cell must match exactly.
        if "Remise commerciale gérance" not in row:
            continue
        r = {header[i]: value for i, value in enumerate(row)}
        r.update(extra)
        extracted.append(r)

    return extracted
|
||||
|
||||
# df = pd.DataFrame(table[1:], columns=table[0]).replace("", np.nan)
|
||||
# df = df[
|
||||
# df["RECAPITULATIF DES OPERATIONS"].str.contains(
|
||||
# "Remise commerciale gérance", case=False, na=False
|
||||
# )
|
||||
# ]
|
||||
#
|
||||
# df.columns.values[0] = "Fournisseur"
|
||||
# return df
|
||||
Reference in New Issue
Block a user