pdf_auralia/pdf_oralia/extract.py

135 lines
4.2 KiB
Python

import logging
from datetime import datetime
from pathlib import Path
import pandas as pd
import pdfplumber
from pdf_oralia.pages import charge, locataire, patrimoine
extract_table_settings = {
"vertical_strategy": "lines",
"horizontal_strategy": "text",
}
def extract_date(page_text):
"""Extract date from a page
:param page_text: text in the page
:return: the extracted date
"""
blocs = page_text.split("\n")
for b in blocs:
if "Lyon le" in b:
words = b.split(" ")
return datetime.strptime(words[-1], "%d/%m/%Y")
def extract_building(page_text, buildings=["bloch", "marietton", "servient"]):
for building in buildings:
if building in page_text.lower():
return building
raise ValueError("Pas d'immeuble trouvé")
def pdf_extract_tables_lines(pdf):
loc_sink = locataire.fsm()
next(loc_sink)
charge_sink = charge.fsm()
next(charge_sink)
patrimoine_sink = patrimoine.fsm()
next(patrimoine_sink)
for page_number, page in enumerate(pdf.pages):
page_text = page.extract_text()
date = extract_date(page_text)
try:
additionnal_fields = {
"immeuble": extract_building(page_text),
"mois": date.strftime("%m"),
"annee": date.strftime("%Y"),
}
except ValueError:
logging.warning(
f"L'immeuble de la page {page_number+1} non identifiable. Page ignorée."
)
continue
table_type = ""
if locataire.is_it(page_text):
table_type = "locataire"
elif charge.is_it(page_text):
table_type = "charge"
elif patrimoine.is_it(page_text):
table_type = "patrimoine"
else:
logging.warning(
f"Type de la page {page_number+1} non identifiable. Page ignorée."
)
continue
for line in page.extract_table(extract_table_settings):
if table_type == "locataire":
res = loc_sink.send(line)
if res:
res.update(additionnal_fields)
yield locataire.Line(**res)
elif table_type == "charge":
res = charge_sink.send(line)
if res:
res.update(additionnal_fields)
yield charge.Line(**res)
elif table_type == "patrimoine":
res = patrimoine_sink.send(line)
if res:
res.update(additionnal_fields)
yield patrimoine.Line(**res)
def from_pdf(pdf_file):
"""Build dataframes one about charges and another on loc"""
pdf = pdfplumber.open(pdf_file)
locataire_lines = []
charge_lines = []
patrimoine_lines = []
for line in pdf_extract_tables_lines(pdf):
if isinstance(line, locataire.Line):
locataire_lines.append(line)
elif isinstance(line, charge.Line):
charge_lines.append(line)
elif isinstance(line, patrimoine.Line):
patrimoine_lines.append(line)
else:
logging.warning(f"Page {page_number+1} non reconnu. Page ignorée.")
return {
"charge": pd.DataFrame([c.__dict__ for c in charge_lines]),
"locataire": pd.DataFrame([c.__dict__ for c in locataire_lines]),
"patrimoine": pd.DataFrame([c.__dict__ for c in patrimoine_lines]),
}
def extract_plan(pdf_file, dest):
return {
"charge": Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_charge.xlsx",
"locataire": Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_locataire.xlsx",
"patrimoine": Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_patrimoine.xlsx",
}
def extract_save(pdf_file, dest, save=[]):
"""Extract charge and locataire for pdf_file and put xlsx file in dest"""
pdf_file = Path(pdf_file)
xlss = extract_plan(pdf_file, dest)
if save != []:
dfs = from_pdf(pdf_file)
for s in save:
dfs[s].to_excel(xlss[s], sheet_name=s, index=False)
logging.info(f"{xlss[s]} saved")
return {k: v for k, v in xlss.items() if k in save}
return xlss