Feat: script command all works recursively
This commit is contained in:
parent
4ee78a7e7b
commit
3c18bd5d81
@ -41,15 +41,20 @@ def pdf_extract_tables_lines(pdf):
|
|||||||
patrimoine_sink = patrimoine.fsm()
|
patrimoine_sink = patrimoine.fsm()
|
||||||
next(patrimoine_sink)
|
next(patrimoine_sink)
|
||||||
|
|
||||||
page_number = 1
|
for page_number, page in enumerate(pdf.pages):
|
||||||
for page in pdf.pages:
|
|
||||||
page_text = page.extract_text()
|
page_text = page.extract_text()
|
||||||
date = extract_date(page_text)
|
date = extract_date(page_text)
|
||||||
|
try:
|
||||||
additionnal_fields = {
|
additionnal_fields = {
|
||||||
"immeuble": extract_building(page_text),
|
"immeuble": extract_building(page_text),
|
||||||
"mois": date.strftime("%m"),
|
"mois": date.strftime("%m"),
|
||||||
"annee": date.strftime("%Y"),
|
"annee": date.strftime("%Y"),
|
||||||
}
|
}
|
||||||
|
except ValueError:
|
||||||
|
logging.warning(
|
||||||
|
f"L'immeuble de la page {page_number+1} non identifiable. Page ignorée."
|
||||||
|
)
|
||||||
|
continue
|
||||||
table_type = ""
|
table_type = ""
|
||||||
if locataire.is_it(page_text):
|
if locataire.is_it(page_text):
|
||||||
table_type = "locataire"
|
table_type = "locataire"
|
||||||
@ -58,7 +63,10 @@ def pdf_extract_tables_lines(pdf):
|
|||||||
elif patrimoine.is_it(page_text):
|
elif patrimoine.is_it(page_text):
|
||||||
table_type = "patrimoine"
|
table_type = "patrimoine"
|
||||||
else:
|
else:
|
||||||
logging.warning(f"Page {page_number} non reconnu. Page ignorée.")
|
logging.warning(
|
||||||
|
f"Type de la page {page_number+1} non identifiable. Page ignorée."
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
for line in page.extract_table(extract_table_settings):
|
for line in page.extract_table(extract_table_settings):
|
||||||
if table_type == "locataire":
|
if table_type == "locataire":
|
||||||
@ -78,8 +86,6 @@ def pdf_extract_tables_lines(pdf):
|
|||||||
res.update(additionnal_fields)
|
res.update(additionnal_fields)
|
||||||
yield patrimoine.Line(**res)
|
yield patrimoine.Line(**res)
|
||||||
|
|
||||||
page_number += 1
|
|
||||||
|
|
||||||
|
|
||||||
def from_pdf(pdf_file):
|
def from_pdf(pdf_file):
|
||||||
"""Build dataframes one about charges and another on loc"""
|
"""Build dataframes one about charges and another on loc"""
|
||||||
@ -97,25 +103,32 @@ def from_pdf(pdf_file):
|
|||||||
else:
|
else:
|
||||||
logging.warning(f"Page {page_number+1} non reconnu. Page ignorée.")
|
logging.warning(f"Page {page_number+1} non reconnu. Page ignorée.")
|
||||||
|
|
||||||
return (
|
return {
|
||||||
pd.DataFrame([c.__dict__ for c in charge_lines]),
|
"charge": pd.DataFrame([c.__dict__ for c in charge_lines]),
|
||||||
pd.DataFrame([c.__dict__ for c in locataire_lines]),
|
"locataire": pd.DataFrame([c.__dict__ for c in locataire_lines]),
|
||||||
pd.DataFrame([c.__dict__ for c in patrimoine_lines]),
|
"patrimoine": pd.DataFrame([c.__dict__ for c in patrimoine_lines]),
|
||||||
)
|
}
|
||||||
|
|
||||||
|
|
||||||
def extract_save(pdf_file, dest):
|
def extract_plan(pdf_file, dest):
|
||||||
|
return {
|
||||||
|
"charge": Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_charge.xlsx",
|
||||||
|
"locataire": Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_locataire.xlsx",
|
||||||
|
"patrimoine": Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_patrimoine.xlsx",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def extract_save(pdf_file, dest, save=[]):
|
||||||
"""Extract charge and locataire for pdf_file and put xlsx file in dest"""
|
"""Extract charge and locataire for pdf_file and put xlsx file in dest"""
|
||||||
pdf_file = Path(pdf_file)
|
pdf_file = Path(pdf_file)
|
||||||
xls_charge = Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_charge.xlsx"
|
xlss = extract_plan(pdf_file, dest)
|
||||||
xls_locataire = Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_locataire.xlsx"
|
|
||||||
xls_patrimoine = Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_patrimoine.xlsx"
|
|
||||||
|
|
||||||
df_charge, df_loc, df_patrimoine = from_pdf(pdf_file)
|
if save != []:
|
||||||
|
dfs = from_pdf(pdf_file)
|
||||||
|
|
||||||
df_charge.to_excel(xls_charge, sheet_name="Charges", index=False)
|
for s in save:
|
||||||
logging.info(f"{xls_charge} saved")
|
dfs[s].to_excel(xlss[s], sheet_name=s, index=False)
|
||||||
df_loc.to_excel(xls_locataire, sheet_name="Location", index=False)
|
logging.info(f"{xlss[s]} saved")
|
||||||
logging.info(f"{xls_locataire} saved")
|
return {k: v for k, v in xlss.items() if k in save}
|
||||||
df_patrimoine.to_excel(xls_patrimoine, sheet_name="Patrimoine", index=False)
|
|
||||||
logging.info(f"{xls_patrimoine} saved")
|
return xlss
|
||||||
|
@ -4,7 +4,7 @@ from pathlib import Path
|
|||||||
|
|
||||||
import click
|
import click
|
||||||
|
|
||||||
from .extract import extract_save
|
from .extract import extract_save, extract_plan
|
||||||
from .join import join_excel
|
from .join import join_excel
|
||||||
|
|
||||||
|
|
||||||
@ -51,18 +51,45 @@ def on(pdf_file, dest):
|
|||||||
|
|
||||||
|
|
||||||
@extract.command()
|
@extract.command()
|
||||||
@click.option("--src", help="Tous les fichiers dans folder", default="./")
|
@click.option(
|
||||||
|
"--src", help="Tous les fichiers dans folder (de façon récursive)", default="./"
|
||||||
|
)
|
||||||
@click.option("--dest", help="Où mettre les fichiers produits", default="./")
|
@click.option("--dest", help="Où mettre les fichiers produits", default="./")
|
||||||
def all(src, dest):
|
@click.option(
|
||||||
p = Path(src)
|
"--only-plan",
|
||||||
|
help="Ne produit rien mais indique les changements",
|
||||||
|
default=False,
|
||||||
|
is_flag=True,
|
||||||
|
)
|
||||||
|
@click.option(
|
||||||
|
"--force",
|
||||||
|
help="Écrase les fichiers produits précédemment",
|
||||||
|
default=False,
|
||||||
|
is_flag=True,
|
||||||
|
)
|
||||||
|
def all(src, dest, force, only_plan):
|
||||||
|
src_path = Path(src)
|
||||||
|
|
||||||
d = Path(dest)
|
dest = Path(dest)
|
||||||
d.mkdir(exist_ok=True)
|
dest.mkdir(exist_ok=True)
|
||||||
|
|
||||||
pdf_files = [x for x in p.iterdir() if ".pdf" in str(x)]
|
for pdf_file in src_path.rglob("**/*.pdf"):
|
||||||
for pdf_file in pdf_files:
|
relative_path = pdf_file.relative_to(src_path)
|
||||||
|
files_dest = dest / relative_path.parent
|
||||||
logging.info(f"Found {pdf_file}")
|
logging.info(f"Found {pdf_file}")
|
||||||
extract_save(pdf_file, d)
|
|
||||||
|
plan_dest = extract_plan(pdf_file, files_dest)
|
||||||
|
save = []
|
||||||
|
for k, p in plan_dest.items():
|
||||||
|
if not p.exists() or force:
|
||||||
|
save.append(k)
|
||||||
|
|
||||||
|
if only_plan:
|
||||||
|
for s in save:
|
||||||
|
logging.info(f"Planing to create {plan_dest[s]}")
|
||||||
|
else:
|
||||||
|
files_dest.mkdir(parents=True, exist_ok=True)
|
||||||
|
extract_save(pdf_file, files_dest, save)
|
||||||
|
|
||||||
|
|
||||||
@main.command()
|
@main.command()
|
||||||
|
Loading…
Reference in New Issue
Block a user