Feat: split extract

This commit is contained in:
Bertrand Benjamin 2022-09-27 16:07:06 +02:00
parent 4031be77c6
commit e3cc7d18a2
3 changed files with 59 additions and 55 deletions

View File

@ -1,62 +1,9 @@
from pathlib import Path
import click
import numpy as np
import pandas as pd
import pdfplumber
def extract_situation_loc(table):
    """Extract the locataire DataFrame from a table given as a list of rows.

    The first row of ``table`` supplies the column names, the remaining rows
    the data. Every row whose "Locataires" cell equals "Totaux" is kept and
    completed with the ``lot``, ``type`` and ``locataire`` fields parsed from
    the "Locataires" cell of the row just above it. The "Locataires", "" and
    "Période" columns are then removed.

    Args:
        table: sequence of rows, header row first.

    Returns:
        A DataFrame with one row per "Totaux" row; it is empty when the
        table holds no such row.

    Raises:
        ValueError: if ``table`` has no header row.
        KeyError: if one of the expected columns is missing.
    """
    try:
        df = pd.DataFrame(table[1:], columns=table[0])
    except IndexError as err:
        # Without a header row there is nothing to work on; fail with an
        # explicit error instead of tripping over an unbound ``df`` below.
        raise ValueError(
            f"Cannot build the locataire table from {table!r}"
        ) from err

    rows = []
    for i, row in df[df["Locataires"] == "Totaux"].iterrows():
        # ``df`` carries the default RangeIndex, so the label ``i`` is also
        # the position. A "Totaux" row in first position has no row above
        # it: use an empty cell rather than letting ``iloc[-1]`` wrap around
        # to the last row of the table.
        above_row_loc = df.iloc[i - 1]["Locataires"] if i > 0 else ""
        rows.append(pd.concat([row, parse_above_loc(above_row_loc)]))

    if rows:
        df_cleaned = pd.concat(rows, axis=1).T
    else:
        # No "Totaux" row: return an empty frame with the usual columns
        # instead of failing on ``pd.concat([])``.
        df_cleaned = pd.DataFrame(
            columns=[*df.columns, *parse_above_loc("").index]
        )
    return df_cleaned.drop(["Locataires", "", "Période"], axis=1)
def parse_above_loc(content):
    """Split a "Locataires" cell into ``lot``, ``type`` and ``locataire``.

    When ``content`` is made of exactly two lines, the first line is split on
    spaces: its first token is discarded, the second one becomes ``lot`` and
    the remaining ones, joined by spaces, become ``type``; the second line
    becomes ``locataire``. Any other content is kept whole as ``locataire``
    with empty ``lot`` and ``type``.

    Args:
        content: text of the cell.

    Returns:
        A ``pd.Series`` indexed by ``lot``, ``type`` and ``locataire``.
    """
    try:
        app, loc = content.split("\n")
    except ValueError:
        # Not exactly two lines: nothing to parse, keep the raw content.
        return pd.Series({"lot": "", "type": "", "locataire": content})

    # ``str.split`` always yields at least one token, so the unpacking is
    # safe; ``details`` is empty when the first line holds a single word,
    # in which case ``lot`` falls back to "" instead of raising IndexError.
    _, *details = app.split(" ")
    lot = details[0] if details else ""
    return pd.Series(
        {"lot": lot, "type": " ".join(details[1:]), "locataire": loc}
    )
def extract_charge(table):
    """Build the charge DataFrame from a table given as a list of rows.

    The first row of ``table`` supplies the column names and the remaining
    rows the data. Empty strings are turned into NaN, rows without a
    "Débits" value are discarded, and so are the rows whose
    "RECAPITULATIF DES OPERATIONS" cell contains "TOTAUX" or "solde"
    (case-insensitive).
    """
    records = table[1:]
    header = table[0]
    raw = pd.DataFrame(records, columns=header)
    df = raw.replace("", np.nan).dropna(subset=["Débits"])

    # Flag the summary lines so they can be removed from the result.
    label = df["RECAPITULATIF DES OPERATIONS"]
    is_total = label.str.contains("TOTAUX", case=False)
    is_balance = label.str.contains("solde", case=False)
    summary_rows = df[is_total | is_balance].index
    return df.drop(summary_rows)
from .extract_charge import extract_charge
from .extract_locataire import extract_situation_loc
charge_table_settings = {
"vertical_strategy": "lines",

View File

@ -0,0 +1,17 @@
import numpy as np
import pandas as pd
def extract_charge(table):
    """Build the charge DataFrame from a pdfplumber table.

    The first row of ``table`` supplies the column names and the remaining
    rows the data. Empty strings are turned into NaN, rows without a
    "Débits" value are discarded, and so are the rows whose
    "RECAPITULATIF DES OPERATIONS" cell contains "TOTAUX" or "solde"
    (case-insensitive).
    """
    records = table[1:]
    header = table[0]
    raw = pd.DataFrame(records, columns=header)
    df = raw.replace("", np.nan).dropna(subset=["Débits"])

    # Flag the summary lines so they can be removed from the result.
    label = df["RECAPITULATIF DES OPERATIONS"]
    is_total = label.str.contains("TOTAUX", case=False)
    is_balance = label.str.contains("solde", case=False)
    summary_rows = df[is_total | is_balance].index
    return df.drop(summary_rows)

View File

@ -0,0 +1,40 @@
import pandas as pd
def parse_above_loc(content):
    """Split a "Locataires" cell into ``lot``, ``type`` and ``locataire``.

    When ``content`` is made of exactly two lines, the first line is split on
    spaces: its first token is discarded, the second one becomes ``lot`` and
    the remaining ones, joined by spaces, become ``type``; the second line
    becomes ``locataire``. Any other content is kept whole as ``locataire``
    with empty ``lot`` and ``type``.

    Args:
        content: text of the cell.

    Returns:
        A ``pd.Series`` indexed by ``lot``, ``type`` and ``locataire``.
    """
    try:
        app, loc = content.split("\n")
    except ValueError:
        # Not exactly two lines: nothing to parse, keep the raw content.
        return pd.Series({"lot": "", "type": "", "locataire": content})

    # ``str.split`` always yields at least one token, so the unpacking is
    # safe; ``details`` is empty when the first line holds a single word,
    # in which case ``lot`` falls back to "" instead of raising IndexError.
    _, *details = app.split(" ")
    lot = details[0] if details else ""
    return pd.Series(
        {"lot": lot, "type": " ".join(details[1:]), "locataire": loc}
    )
def extract_situation_loc(table):
    """Extract the locataire DataFrame from a pdfplumber table.

    The first row of ``table`` supplies the column names, the remaining rows
    the data. Every row whose "Locataires" cell equals "Totaux" is kept and
    completed with the ``lot``, ``type`` and ``locataire`` fields parsed from
    the "Locataires" cell of the row just above it. The "Locataires", "" and
    "Période" columns are then removed.

    Args:
        table: sequence of rows, header row first.

    Returns:
        A DataFrame with one row per "Totaux" row; it is empty when the
        table holds no such row.

    Raises:
        ValueError: if ``table`` has no header row.
        KeyError: if one of the expected columns is missing.
    """
    try:
        df = pd.DataFrame(table[1:], columns=table[0])
    except IndexError as err:
        # Without a header row there is nothing to work on; fail with an
        # explicit error instead of tripping over an unbound ``df`` below.
        raise ValueError(
            f"Cannot build the locataire table from {table!r}"
        ) from err

    rows = []
    for i, row in df[df["Locataires"] == "Totaux"].iterrows():
        # ``df`` carries the default RangeIndex, so the label ``i`` is also
        # the position. A "Totaux" row in first position has no row above
        # it: use an empty cell rather than letting ``iloc[-1]`` wrap around
        # to the last row of the table.
        above_row_loc = df.iloc[i - 1]["Locataires"] if i > 0 else ""
        rows.append(pd.concat([row, parse_above_loc(above_row_loc)]))

    if rows:
        df_cleaned = pd.concat(rows, axis=1).T
    else:
        # No "Totaux" row: return an empty frame with the usual columns
        # instead of failing on ``pd.concat([])``.
        df_cleaned = pd.DataFrame(
            columns=[*df.columns, *parse_above_loc("").index]
        )
    return df_cleaned.drop(["Locataires", "", "Période"], axis=1)