pdf_auralia/pdf_oralia/extract.py

100 lines
3.0 KiB
Python
Raw Permalink Normal View History

import logging
2022-12-18 09:01:19 +00:00
from datetime import datetime
2022-09-27 12:48:41 +00:00
from pathlib import Path
2022-09-27 13:01:14 +00:00
import pdfplumber
2022-09-27 12:48:41 +00:00
from pdf_oralia.pages import charge, locataire, patrimoine, recapitulatif
2022-09-27 12:48:41 +00:00
# Settings handed to ``page.extract_tables`` for the locataire and charge
# pages: vertical (column) boundaries follow the lines drawn in the PDF,
# horizontal (row) boundaries are deduced from the text.
extract_table_settings = {
    "vertical_strategy": "lines",
    "horizontal_strategy": "text",
}
2022-12-18 09:01:19 +00:00
def extract_date(page_text):
    """Extract date from a page

    The first line containing ``"Lyon le"`` is used; its last
    whitespace-separated word is parsed as ``dd/mm/YYYY``.

    :param page_text: text in the page
    :return: the extracted date (``datetime``), or ``None`` when no line
        contains ``"Lyon le"``
    :raises ValueError: when the last word of that line is not a valid date
    """
    for line in page_text.split("\n"):
        if "Lyon le" in line:
            # split() without argument drops trailing/repeated whitespace
            # (including a stray "\r"), so the last word is really the date.
            words = line.split()
            return datetime.strptime(words[-1], "%d/%m/%Y")
    return None
def extract_building(page_text, buildings=("bloch", "marietton", "servient")):
    """Find which building a page is about

    Names are searched case-insensitively, in the order given; the first one
    found in the page wins.

    :param page_text: text in the page
    :param buildings: candidate building names
    :return: the first element of ``buildings`` found in the page text
    :raises ValueError: when none of the names appears in the page
    """
    # Lower-case the page once instead of once per candidate.
    lowered_text = page_text.lower()
    for building in buildings:
        if building.lower() in lowered_text:
            return building
    raise ValueError("Pas d'immeuble trouvé")
def catch_malformed_table(tables):
    """Pick the table to parse among the tables extracted from a page

    When exactly two tables were extracted, the second one is appended to the
    first and the result is returned. In every other case only the first
    table is returned.

    :param tables: tables extracted from the page
    :return: a single table
    """
    if len(tables) != 2:
        return tables[0]
    first_part, second_part = tables
    return first_part + second_part
def _additional_fields(page_text):
    """Build the fields (building, month, year) attached to a page's rows."""
    date = extract_date(page_text)
    if date is None:
        # Fail with an explicit message rather than an AttributeError on None.
        raise ValueError("Pas de date trouvée")
    return {
        "immeuble": extract_building(page_text),
        "mois": date.strftime("%m"),
        "annee": date.strftime("%Y"),
    }


def from_pdf(pdf):
    """Build dataframes one about charges and another on loc

    Every page is classified from its text (recapitulatif, locataire, charge
    or patrimoine) and parsed accordingly. Patrimoine pages are recognised
    but not used; unrecognised pages are logged and ignored.

    :param pdf: opened pdf, exposing its pages through ``pdf.pages``
    :return: ``(df_charge, df_loc)``; ``df_charge`` is built from the
        recapitulatif tables followed by the charge tables, ``df_loc`` from
        the locataire tables
    """
    recapitulatif_tables = []
    loc_tables = []
    charge_tables = []

    for page_number, page in enumerate(pdf.pages, start=1):
        # NOTE(review): extract_text() presumably can return None on a page
        # without text depending on the pdfplumber version; treat it as empty.
        page_text = page.extract_text() or ""

        # Date and building are only parsed for pages that are actually
        # used, so a page meant to be ignored cannot abort the extraction.
        if recapitulatif.is_it(page_text):
            additionnal_fields = _additional_fields(page_text)
            table = page.extract_tables()[0]
            extracted = recapitulatif.extract(table, additionnal_fields)
            if extracted:
                recapitulatif_tables.append(extracted)

        elif locataire.is_it(page_text):
            additionnal_fields = _additional_fields(page_text)
            # The first extracted table is skipped.
            tables = page.extract_tables(extract_table_settings)[1:]
            table = catch_malformed_table(tables)
            extracted = locataire.extract(table, additionnal_fields)
            loc_tables.append(extracted)

        elif charge.is_it(page_text):
            additionnal_fields = _additional_fields(page_text)
            # The first extracted table is skipped.
            tables = page.extract_tables(extract_table_settings)[1:]
            table = catch_malformed_table(tables)
            extracted = charge.extract(table, additionnal_fields)
            charge_tables.append(extracted)

        elif patrimoine.is_it(page_text):
            # Recognised page type, nothing is extracted from it.
            continue

        else:
            logging.warning("Page %d non reconnu. Page ignorée.", page_number)

    df_charge = charge.table2df(recapitulatif_tables + charge_tables)
    df_loc = locataire.table2df(loc_tables)
    return df_charge, df_loc
2022-09-27 12:48:41 +00:00
2022-09-27 14:01:09 +00:00
def extract_save(pdf_file, dest):
    """Extract charge and locataire for pdf_file and put xlsx file in dest

    Two files are written in ``dest``, named after the pdf file (spaces
    replaced by underscores): ``<name>_charge.xlsx`` and
    ``<name>_locataire.xlsx``.

    :param pdf_file: path to the pdf file to read
    :param dest: directory where the xlsx files are written
    """
    pdf_file = Path(pdf_file)
    basename = pdf_file.stem.replace(" ", "_")
    dest = Path(dest)
    xls_charge = dest / f"{basename}_charge.xlsx"
    xls_locataire = dest / f"{basename}_locataire.xlsx"

    # Context manager so the pdf file is closed even if the extraction fails.
    with pdfplumber.open(pdf_file) as pdf:
        df_charge, df_loc = from_pdf(pdf)

    df_charge.to_excel(xls_charge, sheet_name="Charges", index=False)
    logging.info("%s saved", xls_charge)
    df_loc.to_excel(xls_locataire, sheet_name="Location", index=False)
    logging.info("%s saved", xls_locataire)