Feat: join locataire table with pagebreak

This commit is contained in:
Bertrand Benjamin 2022-09-28 15:23:21 +02:00
parent 9a09ae0948
commit c0c550bd59
2 changed files with 39 additions and 6 deletions

View File

@ -14,7 +14,7 @@ charge_table_settings = {
def extract_from_pdf(pdf, charge_dest, location_dest):
"""Build charge_dest and location_dest xlsx file from pdf"""
loc_table = []
loc_tables = []
for page in pdf.pages[1:]:
page_text = page.extract_text()
situation_loc_line = [
@ -22,10 +22,10 @@ def extract_from_pdf(pdf, charge_dest, location_dest):
]
if situation_loc_line:
mois, annee = situation_loc_line[0].split(" ")[-2:]
if loc_table:
loc_table += page.extract_table()[1:]
if loc_tables:
loc_tables.append(page.extract_table()[1:])
else:
loc_table = page.extract_table()
loc_tables.append(page.extract_table())
elif "HONORAIRES" in page_text:
table = page.extract_table(charge_table_settings)
@ -33,7 +33,7 @@ def extract_from_pdf(pdf, charge_dest, location_dest):
df_charge.to_excel(charge_dest, sheet_name="Charges", index=False)
logging.info(f"{charge_dest} saved")
df_loc = extract_situation_loc(loc_table, mois=mois, annee=annee)
df_loc = extract_situation_loc(loc_tables, mois=mois, annee=annee)
df_loc = df_loc.assign()
df_loc.to_excel(location_dest, sheet_name="Location", index=False)
logging.info(f"{location_dest} saved")

View File

@ -1,3 +1,5 @@
import logging
import pandas as pd
@ -18,8 +20,39 @@ def parse_above_loc(content):
return pd.Series(row)
def extract_situation_loc(table, mois, annee):
def join_row(last, next):
    """Merge the two halves of a table row that was split across two pages.

    Cells are combined column by column:

    * both cells non-empty -> joined with a newline (``"<last>\\n<next>"``)
    * only one cell non-empty -> that cell, unchanged
    * both cells empty (``None`` or ``""``) -> ``""``

    The result always has ``len(last)`` cells.  If ``next`` has fewer cells
    than ``last`` the missing cells are treated as empty instead of raising
    ``IndexError``; extra trailing cells in ``next`` are ignored.

    :param last: last row of the table extracted so far (list of cells)
    :param next: first row of the following page's table (list of cells)
    :return: new list holding the merged cells
    """
    # NOTE: ``next`` shadows the builtin, but the parameter name is part of
    # the signature and is kept so keyword callers keep working.
    # A negative repeat count yields an empty list, so a longer ``next`` is
    # left as is and simply truncated by ``zip`` below.
    padded = list(next) + [None] * (len(last) - len(next))

    row = []
    for head, tail in zip(last, padded):
        if head and tail:
            row.append(f"{head}\n{tail}")
        else:
            # Whichever side is non-empty, or "" when both are empty.
            row.append(head or tail or "")
    return row
def join_tables(tables):
    """Concatenate per-page tables into a single table.

    The tables are appended in order.  When the last row collected so far
    does not contain ``"Totaux"`` in its first cell, it is considered to be
    cut by the page break: it is merged with the first row of the following
    table through ``join_row``.  Otherwise the following table is appended
    as is.

    The input is never modified: a new outer list is returned.  An empty
    ``tables`` yields ``[]`` and empty tables in the sequence are skipped.

    :param tables: list of tables, each table being a list of rows
    :return: a single table (list of rows)
    """
    if not tables:
        return []

    # Shallow copy so the caller's first table is not extended in place.
    joined = list(tables[0])
    for t in tables[1:]:
        if not t:
            # Nothing to append (e.g. a page whose table only had a header).
            continue
        if not joined:
            joined = list(t)
            continue

        last_row = joined[-1]
        # Empty cells may be None; treat them as an empty string so the
        # membership test cannot raise TypeError.
        first_cell = last_row[0] if last_row else None
        if "Totaux" in (first_cell or ""):
            joined.extend(t)
        else:
            joined[-1] = join_row(last_row, t[0])
            joined.extend(t[1:])
    return joined
def extract_situation_loc(tables, mois, annee):
"""From pdfplumber table extract locataire df"""
table = join_tables(tables)
try:
df = pd.DataFrame(table[1:], columns=table[0])
except IndexError: