pdf_auralia/pdf_oralia/extract_charge.py

69 lines
1.8 KiB
Python

import logging
import numpy as np
import pandas as pd
def get_lot(x):
"""Return lot number from "RECAPITULATIF DES OPERATIONS" """
if x[:2].isdigit():
return x[:2]
if x[:1].isdigit():
return "0" + x[:1]
if x[:2] == "PC":
return "PC"
return ""
def extract_charge(table):
"""From pdfplumber table extract the charge dataframe"""
df = (
pd.DataFrame(table[1:], columns=table[0])
.replace("", np.nan)
.dropna(subset=["Débits", "Crédits"], how="all")
)
drop_index = df[
df["RECAPITULATIF DES OPERATIONS"].str.contains("TOTAUX", case=False)
| df["RECAPITULATIF DES OPERATIONS"].str.contains("Solde créditeur", case=False)
| df["RECAPITULATIF DES OPERATIONS"].str.contains("Solde débiteur", case=False)
| df["RECAPITULATIF DES OPERATIONS"].str.contains(
"Total des reglements locataires", case=False
)
].index
df.drop(drop_index, inplace=True)
df[""].mask(
df["RECAPITULATIF DES OPERATIONS"].str.contains("honoraires", case=False),
"IMI GERANCE",
inplace=True,
)
df = df.assign(lot=df["RECAPITULATIF DES OPERATIONS"].map(get_lot))
df = df.astype(
{
"Débits": "float64",
"Crédits": "float64",
"Dont T.V.A.": "float64",
"Locatif": "float64",
"Déductible": "float64",
}
)
df.columns.values[0] = "Fournisseur"
return df
def extract_remise_com(table):
"""Extract "remise commercial" from first page"""
df = pd.DataFrame(table[1:], columns=table[0]).replace("", np.nan)
df = df[
df["RECAPITULATIF DES OPERATIONS"].str.contains(
"Remise commerciale gérance", case=False, na=False
)
]
df.columns.values[0] = "Fournisseur"
return df