From 64dd44488f6735f778ab4ea6e42d354a13479d8f Mon Sep 17 00:00:00 2001 From: Bertrand Benjamin Date: Wed, 28 Sep 2022 09:56:35 +0200 Subject: [PATCH] Feat: add logging and don't watch first page --- pdf_oralia/extract.py | 17 +++++++++-------- pdf_oralia/extract_charge.py | 21 ++++++++++++--------- pdf_oralia/scripts.py | 21 +++++++++++++++++++++ 3 files changed, 42 insertions(+), 17 deletions(-) diff --git a/pdf_oralia/extract.py b/pdf_oralia/extract.py index c8efd50..bc2135f 100644 --- a/pdf_oralia/extract.py +++ b/pdf_oralia/extract.py @@ -1,3 +1,4 @@ +import logging from pathlib import Path import pdfplumber @@ -14,11 +15,10 @@ charge_table_settings = { def extract_from_pdf(pdf, charge_dest, location_dest): """Build charge_dest and location_dest xlsx file from pdf""" loc_table = [] - for page in pdf.pages: + for page in pdf.pages[1:]: + page_text = page.extract_text() situation_loc_line = [ - l - for l in page.extract_text().split("\n") - if "SITUATION DES LOCATAIRES" in l + l for l in page_text.split("\n") if "SITUATION DES LOCATAIRES" in l ] if situation_loc_line: mois, annee = situation_loc_line[0].split(" ")[-2:] @@ -27,15 +27,16 @@ def extract_from_pdf(pdf, charge_dest, location_dest): else: loc_table = page.extract_table() - if "HONORAIRES" in page.extract_text(): - df_charge = extract_charge(page.extract_table(charge_table_settings)) + elif "HONORAIRES" in page_text: + table = page.extract_table(charge_table_settings) + df_charge = extract_charge(table) df_charge.to_excel(charge_dest, sheet_name="Charges", index=False) - print(f"{charge_dest} saved") + logging.info(f"{charge_dest} saved") df_loc = extract_situation_loc(loc_table, mois=mois, annee=annee) df_loc = df_loc.assign() df_loc.to_excel(location_dest, sheet_name="Location", index=False) - print(f"{location_dest} saved") + logging.info(f"{location_dest} saved") def extract_save(pdf_file, dest): diff --git a/pdf_oralia/extract_charge.py b/pdf_oralia/extract_charge.py index 98d9d76..18bd267 100644 --- a/pdf_oralia/extract_charge.py +++ b/pdf_oralia/extract_charge.py @@ -1,3 +1,5 @@ +import logging + import numpy as np import pandas as pd @@ -8,15 +10,6 @@ def extract_charge(table): pd.DataFrame(table[1:], columns=table[0]) .replace("", np.nan) .dropna(subset=["Débits"]) - .astype( - { - "Débits": "float64", - "Crédits": "float64", - "Dont T.V.A.": "float64", - "Locatif": "float64", - "Déductible": "float64", - } - ) ) drop_index = df[ df["RECAPITULATIF DES OPERATIONS"].str.contains("TOTAUX", case=False) @@ -30,4 +23,14 @@ def extract_charge(table): inplace=True, ) + df = df.astype( + { + "Débits": "float64", + "Crédits": "float64", + "Dont T.V.A.": "float64", + "Locatif": "float64", + "Déductible": "float64", + } + ) + return df diff --git a/pdf_oralia/scripts.py b/pdf_oralia/scripts.py index fe7dff9..b73d284 100644 --- a/pdf_oralia/scripts.py +++ b/pdf_oralia/scripts.py @@ -1,9 +1,29 @@ +import logging +from logging.config import dictConfig from pathlib import Path import click from .extract import extract_save +logging_config = dict( + version=1, + formatters={"f": {"format": "%(levelname)-8s %(name)-12s %(message)s"}}, + handlers={ + "h": { + "class": "logging.StreamHandler", + "formatter": "f", + "level": logging.DEBUG, + } + }, + root={ + "handlers": ["h"], + "level": logging.DEBUG, + }, +) + +dictConfig(logging_config) + @click.group() def main(): @@ -37,6 +57,7 @@ def all(folder, dest): pdf_files = [x for x in p.iterdir() if ".pdf" in str(x)] for pdf_file in pdf_files: + logging.info(f"Found {pdf_file}") extract_save(pdf_file, d)