Feat: add logging and skip the first page

This commit is contained in:
Bertrand Benjamin 2022-09-28 09:56:35 +02:00
parent 47c810a024
commit 64dd44488f
3 changed files with 42 additions and 17 deletions

View File

@@ -1,3 +1,4 @@
import logging
from pathlib import Path from pathlib import Path
import pdfplumber import pdfplumber
@@ -14,11 +15,10 @@ charge_table_settings = {
def extract_from_pdf(pdf, charge_dest, location_dest): def extract_from_pdf(pdf, charge_dest, location_dest):
"""Build charge_dest and location_dest xlsx file from pdf""" """Build charge_dest and location_dest xlsx file from pdf"""
loc_table = [] loc_table = []
for page in pdf.pages: for page in pdf.pages[1:]:
page_text = page.extract_text()
situation_loc_line = [ situation_loc_line = [
l l for l in page_text.split("\n") if "SITUATION DES LOCATAIRES" in l
for l in page.extract_text().split("\n")
if "SITUATION DES LOCATAIRES" in l
] ]
if situation_loc_line: if situation_loc_line:
mois, annee = situation_loc_line[0].split(" ")[-2:] mois, annee = situation_loc_line[0].split(" ")[-2:]
@@ -27,15 +27,16 @@ def extract_from_pdf(pdf, charge_dest, location_dest):
else: else:
loc_table = page.extract_table() loc_table = page.extract_table()
if "HONORAIRES" in page.extract_text(): elif "HONORAIRES" in page_text:
df_charge = extract_charge(page.extract_table(charge_table_settings)) table = page.extract_table(charge_table_settings)
df_charge = extract_charge(table)
df_charge.to_excel(charge_dest, sheet_name="Charges", index=False) df_charge.to_excel(charge_dest, sheet_name="Charges", index=False)
print(f"{charge_dest} saved") logging.info(f"{charge_dest} saved")
df_loc = extract_situation_loc(loc_table, mois=mois, annee=annee) df_loc = extract_situation_loc(loc_table, mois=mois, annee=annee)
df_loc = df_loc.assign() df_loc = df_loc.assign()
df_loc.to_excel(location_dest, sheet_name="Location", index=False) df_loc.to_excel(location_dest, sheet_name="Location", index=False)
print(f"{location_dest} saved") logging.info(f"{location_dest} saved")
def extract_save(pdf_file, dest): def extract_save(pdf_file, dest):

View File

@@ -1,3 +1,5 @@
import logging
import numpy as np import numpy as np
import pandas as pd import pandas as pd
@@ -8,15 +10,6 @@ def extract_charge(table):
pd.DataFrame(table[1:], columns=table[0]) pd.DataFrame(table[1:], columns=table[0])
.replace("", np.nan) .replace("", np.nan)
.dropna(subset=["Débits"]) .dropna(subset=["Débits"])
.astype(
{
"Débits": "float64",
"Crédits": "float64",
"Dont T.V.A.": "float64",
"Locatif": "float64",
"Déductible": "float64",
}
)
) )
drop_index = df[ drop_index = df[
df["RECAPITULATIF DES OPERATIONS"].str.contains("TOTAUX", case=False) df["RECAPITULATIF DES OPERATIONS"].str.contains("TOTAUX", case=False)
@@ -30,4 +23,14 @@ def extract_charge(table):
inplace=True, inplace=True,
) )
df = df.astype(
{
"Débits": "float64",
"Crédits": "float64",
"Dont T.V.A.": "float64",
"Locatif": "float64",
"Déductible": "float64",
}
)
return df return df

View File

@@ -1,9 +1,29 @@
import logging
from logging.config import dictConfig
from pathlib import Path from pathlib import Path
import click import click
from .extract import extract_save from .extract import extract_save
# Logging setup, applied as soon as this module is imported: one
# StreamHandler attached to the root logger, both set to DEBUG, with each
# record rendered as "<level> <logger name> <message>".
logging_config = {
    "version": 1,
    "formatters": {
        "f": {"format": "%(levelname)-8s %(name)-12s %(message)s"},
    },
    "handlers": {
        "h": {
            "class": "logging.StreamHandler",
            "formatter": "f",
            "level": logging.DEBUG,
        },
    },
    "root": {
        "handlers": ["h"],
        "level": logging.DEBUG,
    },
}

dictConfig(logging_config)
@click.group() @click.group()
def main(): def main():
@@ -37,6 +57,7 @@ def all(folder, dest):
pdf_files = [x for x in p.iterdir() if ".pdf" in str(x)] pdf_files = [x for x in p.iterdir() if ".pdf" in str(x)]
for pdf_file in pdf_files: for pdf_file in pdf_files:
logging.info(f"Found {pdf_file}")
extract_save(pdf_file, d) extract_save(pdf_file, d)