From 8397e359b0b2a0f5c37a4f0c58adfb4e03a53b9b Mon Sep 17 00:00:00 2001
From: Bertrand Benjamin
Date: Mon, 10 Oct 2022 21:53:12 +0200
Subject: [PATCH] Feat: extract "remise commercial" from 1st page

---
 pdf_oralia/extract.py        | 11 +++++++++--
 pdf_oralia/extract_charge.py | 10 ++++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/pdf_oralia/extract.py b/pdf_oralia/extract.py
index 830870f..fd69757 100644
--- a/pdf_oralia/extract.py
+++ b/pdf_oralia/extract.py
@@ -1,9 +1,10 @@
 import logging
 from pathlib import Path
 
+import pandas as pd
 import pdfplumber
 
-from .extract_charge import extract_charge
+from .extract_charge import extract_charge, extract_remise_com
 from .extract_locataire import extract_situation_loc
 
 charge_table_settings = {
@@ -16,6 +17,11 @@ def extract_from_pdf(pdf, charge_dest, location_dest):
     """Build charge_dest and location_dest xlsx file from pdf"""
     loc_tables = []
     charge_table = []
+
+    df_1st_charge = extract_remise_com(
+        pdf.pages[0].extract_table(charge_table_settings)
+    )
+
     for page in pdf.pages[1:]:
         page_text = page.extract_text()
         situation_loc_line = [
@@ -35,7 +41,8 @@ def extract_from_pdf(pdf, charge_dest, location_dest):
                 charge_table = page.extract_table(charge_table_settings)
 
     df_charge = extract_charge(charge_table)
-    df_charge.to_excel(charge_dest, sheet_name="Charges", index=False)
+    df_charge_with_1st = pd.concat([df_1st_charge, df_charge])
+    df_charge_with_1st.to_excel(charge_dest, sheet_name="Charges", index=False)
     logging.info(f"{charge_dest} saved")
 
     df_loc = extract_situation_loc(loc_tables, mois=mois, annee=annee)
diff --git a/pdf_oralia/extract_charge.py b/pdf_oralia/extract_charge.py
index a52158f..4c49926 100644
--- a/pdf_oralia/extract_charge.py
+++ b/pdf_oralia/extract_charge.py
@@ -51,3 +51,13 @@ def extract_charge(table):
     )
 
     return df
+
+
+def extract_remise_com(table):
+    """Extract "remise commercial" from first page"""
+    df = pd.DataFrame(table[1:], columns=table[0]).replace("", np.nan)
+    return df[
+        df["RECAPITULATIF DES OPERATIONS"].str.contains(
+            "Remise commerciale gérance", case=False, na=False
+        )
+    ]