Feat: the `all` script command now works recursively

This commit is contained in:
Bertrand Benjamin 2025-02-26 09:02:33 +01:00
parent 4ee78a7e7b
commit 3c18bd5d81
2 changed files with 75 additions and 35 deletions

View File

@ -41,15 +41,20 @@ def pdf_extract_tables_lines(pdf):
patrimoine_sink = patrimoine.fsm() patrimoine_sink = patrimoine.fsm()
next(patrimoine_sink) next(patrimoine_sink)
page_number = 1 for page_number, page in enumerate(pdf.pages):
for page in pdf.pages:
page_text = page.extract_text() page_text = page.extract_text()
date = extract_date(page_text) date = extract_date(page_text)
try:
additionnal_fields = { additionnal_fields = {
"immeuble": extract_building(page_text), "immeuble": extract_building(page_text),
"mois": date.strftime("%m"), "mois": date.strftime("%m"),
"annee": date.strftime("%Y"), "annee": date.strftime("%Y"),
} }
except ValueError:
logging.warning(
f"L'immeuble de la page {page_number+1} non identifiable. Page ignorée."
)
continue
table_type = "" table_type = ""
if locataire.is_it(page_text): if locataire.is_it(page_text):
table_type = "locataire" table_type = "locataire"
@ -58,7 +63,10 @@ def pdf_extract_tables_lines(pdf):
elif patrimoine.is_it(page_text): elif patrimoine.is_it(page_text):
table_type = "patrimoine" table_type = "patrimoine"
else: else:
logging.warning(f"Page {page_number} non reconnu. Page ignorée.") logging.warning(
f"Type de la page {page_number+1} non identifiable. Page ignorée."
)
continue
for line in page.extract_table(extract_table_settings): for line in page.extract_table(extract_table_settings):
if table_type == "locataire": if table_type == "locataire":
@ -78,8 +86,6 @@ def pdf_extract_tables_lines(pdf):
res.update(additionnal_fields) res.update(additionnal_fields)
yield patrimoine.Line(**res) yield patrimoine.Line(**res)
page_number += 1
def from_pdf(pdf_file): def from_pdf(pdf_file):
"""Build dataframes one about charges and another on loc""" """Build dataframes one about charges and another on loc"""
@ -97,25 +103,32 @@ def from_pdf(pdf_file):
else: else:
logging.warning(f"Page {page_number+1} non reconnu. Page ignorée.") logging.warning(f"Page {page_number+1} non reconnu. Page ignorée.")
return ( return {
pd.DataFrame([c.__dict__ for c in charge_lines]), "charge": pd.DataFrame([c.__dict__ for c in charge_lines]),
pd.DataFrame([c.__dict__ for c in locataire_lines]), "locataire": pd.DataFrame([c.__dict__ for c in locataire_lines]),
pd.DataFrame([c.__dict__ for c in patrimoine_lines]), "patrimoine": pd.DataFrame([c.__dict__ for c in patrimoine_lines]),
) }
def extract_plan(pdf_file, dest):
    """Return the xlsx paths that an extraction of ``pdf_file`` would produce.

    Nothing is read or written: this only computes file names.

    Args:
        pdf_file: path of the source pdf (``str`` or ``Path``).
        dest: directory that would receive the xlsx files (``str`` or ``Path``).

    Returns:
        A dict mapping each table kind (``"charge"``, ``"locataire"``,
        ``"patrimoine"``) to ``dest / "<stem>_<kind>.xlsx"``, where ``<stem>``
        is the pdf file name without extension, spaces replaced by ``_``.
    """
    # Path() is idempotent, so callers may pass either a str or a Path.
    stem = Path(pdf_file).stem.replace(" ", "_")
    dest = Path(dest)
    return {
        kind: dest / f"{stem}_{kind}.xlsx"
        for kind in ("charge", "locataire", "patrimoine")
    }


def extract_save(pdf_file, dest, save=()):
    """Extract the tables named in ``save`` from ``pdf_file`` into ``dest``.

    Args:
        pdf_file: path of the source pdf (``str`` or ``Path``).
        dest: directory receiving the xlsx files; it must already exist.
        save: iterable of table kinds to write, taken from the keys of
            ``extract_plan`` (``"charge"``, ``"locataire"``, ``"patrimoine"``).
            When empty (the default) the pdf is not read and nothing is
            written.

    Returns:
        The planned xlsx paths keyed by table kind: the whole plan when
        ``save`` is empty, otherwise only the entries that were written.

    Raises:
        KeyError: if ``save`` contains a name that is not a known table kind.
    """
    pdf_file = Path(pdf_file)
    xlss = extract_plan(pdf_file, dest)

    if not save:
        return xlss

    # Reject unknown kinds before the (slow) pdf parsing so that a typo does
    # not leave a partially written set of files behind.
    unknown = [s for s in save if s not in xlss]
    if unknown:
        raise KeyError(f"Unknown table kind(s) {unknown}; expected one of {list(xlss)}")

    dfs = from_pdf(pdf_file)
    for s in save:
        dfs[s].to_excel(xlss[s], sheet_name=s, index=False)
        logging.info(f"{xlss[s]} saved")

    return {k: v for k, v in xlss.items() if k in save}

View File

@ -4,7 +4,7 @@ from pathlib import Path
import click import click
from .extract import extract_save from .extract import extract_save, extract_plan
from .join import join_excel from .join import join_excel
@ -51,18 +51,45 @@ def on(pdf_file, dest):
@extract.command()
@click.option(
    "--src", help="Tous les fichiers dans folder (de façon récursive)", default="./"
)
@click.option("--dest", help="Où mettre les fichiers produits", default="./")
@click.option(
    "--only-plan",
    help="Ne produit rien mais indique les changements",
    default=False,
    is_flag=True,
)
@click.option(
    "--force",
    help="Écrase les fichiers produits précédemment",
    default=False,
    is_flag=True,
)
def all(src, dest, force, only_plan):
    # NOTE: documented with comments rather than a docstring on purpose:
    # click would display a docstring as the command's --help text.
    #
    # Walk `src` recursively and, for every pdf found, write the xlsx files
    # that are missing (or all of them with --force) into `dest`, mirroring
    # the sub-folder layout of `src`. With --only-plan nothing is created on
    # disk; the files that would be written are only logged.
    src_path = Path(src)
    dest = Path(dest)

    if not only_plan:
        # parents=True so that a nested --dest such as "out/2025" works.
        dest.mkdir(parents=True, exist_ok=True)

    # rglob is already recursive: no "**/" prefix needed in the pattern.
    for pdf_file in src_path.rglob("*.pdf"):
        relative_path = pdf_file.relative_to(src_path)
        files_dest = dest / relative_path.parent
        logging.info(f"Found {pdf_file}")

        plan_dest = extract_plan(pdf_file, files_dest)
        # Keep only the outputs that do not exist yet, unless --force is set.
        save = [k for k, p in plan_dest.items() if force or not p.exists()]

        if only_plan:
            for s in save:
                logging.info(f"Planing to create {plan_dest[s]}")
            continue

        if not save:
            # Every output already exists: skip parsing the pdf altogether.
            continue

        files_dest.mkdir(parents=True, exist_ok=True)
        extract_save(pdf_file, files_dest, save)
@main.command() @main.command()