Feat: extract "remise commerciale" from 1st page

This commit is contained in:
Bertrand Benjamin 2022-10-10 21:53:12 +02:00
parent a06720b93c
commit 8397e359b0
2 changed files with 19 additions and 2 deletions

View File

@ -1,9 +1,10 @@
import logging
from pathlib import Path
import pandas as pd
import pdfplumber
from .extract_charge import extract_charge
from .extract_charge import extract_charge, extract_remise_com
from .extract_locataire import extract_situation_loc
charge_table_settings = {
@ -16,6 +17,11 @@ def extract_from_pdf(pdf, charge_dest, location_dest):
"""Build charge_dest and location_dest xlsx file from pdf"""
loc_tables = []
charge_table = []
df_1st_charge = extract_remise_com(
pdf.pages[0].extract_table(charge_table_settings)
)
for page in pdf.pages[1:]:
page_text = page.extract_text()
situation_loc_line = [
@ -35,7 +41,8 @@ def extract_from_pdf(pdf, charge_dest, location_dest):
charge_table = page.extract_table(charge_table_settings)
df_charge = extract_charge(charge_table)
df_charge.to_excel(charge_dest, sheet_name="Charges", index=False)
df_charge_with_1st = pd.concat([df_1st_charge, df_charge])
df_charge_with_1st.to_excel(charge_dest, sheet_name="Charges", index=False)
logging.info(f"{charge_dest} saved")
df_loc = extract_situation_loc(loc_tables, mois=mois, annee=annee)

View File

@ -51,3 +51,13 @@ def extract_charge(table):
)
return df
def extract_remise_com(table):
    """Extract the "remise commerciale" rows from the first-page table.

    Args:
        table: Rows given as lists of cells, the first row being the header
            (presumably the result of pdfplumber's ``Page.extract_table``).
            May be ``None`` or empty when no table was detected.

    Returns:
        A DataFrame holding the rows whose "RECAPITULATIF DES OPERATIONS"
        cell contains "Remise commerciale gérance" (case-insensitive), with
        empty strings replaced by NaN. An empty DataFrame is returned when
        ``table`` is ``None`` or empty.

    Raises:
        KeyError: If the header has no "RECAPITULATIF DES OPERATIONS" column.
    """
    if not table:
        # Nothing was extracted from the page: return a frame that can still
        # be concatenated with other charge frames.
        return pd.DataFrame()

    raw = pd.DataFrame(table[1:], columns=table[0])
    # Build the mask on the raw cells: once "" is replaced by NaN, a column
    # made only of empty cells may lose its object dtype and `.str` would
    # then raise instead of simply matching nothing.
    is_remise = raw["RECAPITULATIF DES OPERATIONS"].str.contains(
        "Remise commerciale gérance", case=False, na=False
    )
    return raw.replace("", np.nan)[is_remise]