pdf_auralia/pdf_oralia/extract.py

55 lines
1.8 KiB
Python
Raw Normal View History

import logging
2022-09-27 12:48:41 +00:00
from pathlib import Path
2022-09-27 13:01:14 +00:00
import pdfplumber
2022-09-27 12:48:41 +00:00
2022-09-27 14:07:06 +00:00
from .extract_charge import extract_charge
from .extract_locataire import extract_situation_loc
2022-09-27 12:48:41 +00:00
# pdfplumber table-extraction settings used for the charge pages (passed to
# ``page.extract_table``): vertical separators are taken from drawn lines,
# horizontal ones are inferred from the text layout.
charge_table_settings = dict(
    vertical_strategy="lines",
    horizontal_strategy="text",
)
def extract_from_pdf(pdf, charge_dest, location_dest):
    """Build charge_dest and location_dest xlsx file from pdf.

    The first page of ``pdf`` is skipped. Every other page is classified by
    its text:

    * a page with a line containing ``"SITUATION DES LOCATAIRES"`` contributes
      its table to the tenant ("Location") data, and the last two
      whitespace-separated tokens of that line are kept as ``mois`` and
      ``annee`` (the values from the last such page win);
    * otherwise, a page containing ``"RECAPITULATIF DES OPERATIONS"``
      contributes its table to the charge data.

    For both kinds, the first row of every table after the first one is
    dropped (presumably a repeated header row -- confirm against real PDFs).

    Args:
        pdf: An opened pdfplumber PDF, or any object exposing ``pages`` whose
            items provide ``extract_text()`` and ``extract_table()``.
        charge_dest: Destination of the charges workbook (sheet "Charges").
        location_dest: Destination of the tenants workbook (sheet "Location").

    Raises:
        ValueError: If no page carries a "SITUATION DES LOCATAIRES" line, so
            ``mois``/``annee`` are unknown. The charges workbook has already
            been written at that point.
    """
    loc_tables = []
    charge_table = []
    # Stay None until a tenant-situation page is seen; checked before use.
    mois = annee = None

    for page in pdf.pages[1:]:
        # Some pdfplumber versions return None for a page without any text.
        page_text = page.extract_text() or ""

        situation_loc_line = next(
            (
                line
                for line in page_text.split("\n")
                if "SITUATION DES LOCATAIRES" in line
            ),
            None,
        )

        if situation_loc_line is not None:
            # split() rather than split(" "): trailing or repeated blanks
            # must not produce empty tokens in place of the last two words.
            mois, annee = situation_loc_line.split()[-2:]
            table = page.extract_table()
            loc_tables.append(table[1:] if loc_tables else table)
        elif "RECAPITULATIF DES OPERATIONS" in page_text:
            table = page.extract_table(charge_table_settings)
            if charge_table:
                charge_table += table[1:]
            else:
                charge_table = table

    df_charge = extract_charge(charge_table)
    df_charge.to_excel(charge_dest, sheet_name="Charges", index=False)
    logging.info("%s saved", charge_dest)

    if mois is None:
        # Previously this surfaced as an UnboundLocalError on ``mois``.
        raise ValueError(
            "No 'SITUATION DES LOCATAIRES' page found in the pdf: "
            "cannot determine mois/annee for the Location sheet"
        )

    df_loc = extract_situation_loc(loc_tables, mois=mois, annee=annee)
    df_loc.to_excel(location_dest, sheet_name="Location", index=False)
    logging.info("%s saved", location_dest)
2022-09-27 12:48:41 +00:00
2022-09-27 14:01:09 +00:00
def extract_save(pdf_file, dest):
    """Extract charge and locataire for pdf_file and put xlsx file in dest.

    Two workbooks are written into ``dest``, named after the PDF file stem
    with spaces replaced by underscores: ``<stem>_charge.xlsx`` and
    ``<stem>_locataire.xlsx``.

    Args:
        pdf_file: Path (str or Path) of the PDF to read.
        dest: Directory (str or Path) receiving the xlsx files.
    """
    pdf_file = Path(pdf_file)
    dest = Path(dest)

    stem = pdf_file.stem.replace(" ", "_")
    xls_charge = dest / f"{stem}_charge.xlsx"
    xls_locataire = dest / f"{stem}_locataire.xlsx"

    # Open through a context manager so the PDF handle is always closed,
    # including when the extraction raises.
    with pdfplumber.open(pdf_file) as pdf:
        extract_from_pdf(pdf, xls_charge, xls_locataire)