pdf_auralia/pdf_oralia/extract_charge.py

51 lines
1.3 KiB
Python
Raw Normal View History

import logging
2022-09-27 14:07:06 +00:00
import numpy as np
import pandas as pd
2022-09-28 19:03:53 +00:00
def get_lot(x):
"""Return lot number from "RECAPITULATIF DES OPERATIONS" """
if x[:2].isdigit():
return x[:2]
if x[:1].isdigit():
return "0" + x[:1]
if x[:2] == "PC":
return "PC"
return ""
2022-09-27 14:07:06 +00:00
def extract_charge(table):
    """From pdfplumber table extract the charge dataframe.

    Args:
        table: sequence of rows. ``table[0]`` is used as the header and the
            remaining rows as data. The header must provide the columns
            "RECAPITULATIF DES OPERATIONS", "" (empty name), "Débits",
            "Crédits", "Dont T.V.A.", "Locatif" and "Déductible".

    Returns:
        A DataFrame where:
        * empty-string cells are replaced by NaN and rows without a
          "Débits" value are dropped;
        * summary rows (label containing "TOTAUX", "Solde créditeur" or
          "Solde débiteur", case-insensitive) are removed;
        * the "" column is set to "IMI GERANCE" on rows whose label contains
          "honoraires" (case-insensitive);
        * a ``lot`` column is added, computed by ``get_lot`` from the label;
        * the five amount columns are cast to ``float64``.
    """
    label = "RECAPITULATIF DES OPERATIONS"

    df = (
        pd.DataFrame(table[1:], columns=table[0])
        .replace("", np.nan)
        .dropna(subset=["Débits"])
    )

    # na=False: a row with an empty label is neither a summary row nor a fee
    # row; without it the mask contains NaN and boolean indexing raises.
    is_summary = df[label].str.contains(
        "TOTAUX|Solde créditeur|Solde débiteur", case=False, na=False
    )
    # .copy() so the column assignment below acts on an independent frame.
    df = df.loc[~is_summary].copy()

    # Assign the result back instead of using `inplace=True` on `df[""]`:
    # an in-place update of a selected column is chained assignment and does
    # not reach `df` under pandas Copy-on-Write.
    is_fee = df[label].str.contains("honoraires", case=False, na=False)
    df[""] = df[""].mask(is_fee, "IMI GERANCE")

    # Empty labels are passed as "" so get_lot always receives a string.
    df = df.assign(lot=df[label].fillna("").map(get_lot))

    return df.astype(
        {
            "Débits": "float64",
            "Crédits": "float64",
            "Dont T.V.A.": "float64",
            "Locatif": "float64",
            "Déductible": "float64",
        }
    )