Feat: make script work

This commit is contained in:
2022-09-27 16:01:09 +02:00
parent e7180b1c28
commit 4031be77c6
4 changed files with 492 additions and 126 deletions

View File

@@ -52,7 +52,7 @@ def extract_charge(table):
)
drop_index = df[
df["RECAPITULATIF DES OPERATIONS"].str.contains("TOTAUX", case=False)
or df["RECAPITULATIF DES OPERATIONS"].str.contains("solde", case=False)
| df["RECAPITULATIF DES OPERATIONS"].str.contains("solde", case=False)
].index
df.drop(drop_index, inplace=True)
return df
@@ -65,6 +65,7 @@ charge_table_settings = {
def extract_from_pdf(pdf, charge_dest, location_dest):
"""Build charge_dest and location_dest xlsx file from pdf"""
loc_table = []
for page in pdf.pages:
situation_loc_line = [
@@ -90,27 +91,11 @@ def extract_from_pdf(pdf, charge_dest, location_dest):
print(f"{location_dest} saved")
def extract_save(pdf_file, dest="./"):
    """Extract the charge and tenant tables of ``pdf_file`` into xlsx files.

    Two workbooks are written into ``dest``. Both are named after the PDF
    stem, with spaces replaced by underscores, and suffixed ``_charge.xlsx``
    and ``_locataire.xlsx`` respectively.

    Args:
        pdf_file: Path (``str`` or ``Path``) of the PDF to read.
        dest: Directory receiving the two xlsx files. Defaults to the current
            directory so that one-argument callers keep working.
    """
    pdf_file = Path(pdf_file)
    dest = Path(dest)
    stem = pdf_file.stem.replace(" ", "_")
    xls_charge = dest / f"{stem}_charge.xlsx"
    xls_locataire = dest / f"{stem}_locataire.xlsx"

    # Context manager so the PDF handle is released even if extraction fails.
    with pdfplumber.open(pdf_file) as pdf:
        extract_from_pdf(pdf, xls_charge, xls_locataire)
@click.command()
@click.option("--pdf_file", help="Nom du fichier pdf", default="")
@click.option("--folder", help="Tous les fichiers dans folder", default="./")
def pdf2xlsx(pdf_file, folder):
    # Convert a single PDF when --pdf_file is given, otherwise every PDF found
    # directly inside --folder. (Kept as a comment: click would surface a
    # docstring as the command's --help text.)
    if pdf_file:
        extract_save(pdf_file)
        return

    # Match on the file suffix rather than on a ".pdf" substring of the whole
    # path, which also matched names like "a.pdf.bak" or any file located in a
    # folder whose own name contains ".pdf".
    pdf_files = sorted(
        x
        for x in Path(folder).iterdir()
        if x.is_file() and x.suffix.lower() == ".pdf"
    )
    for found in pdf_files:
        extract_save(found)


if __name__ == "__main__":
    pdf2xlsx()

View File

@@ -1,116 +1,40 @@
from pathlib import Path
import click
import numpy as np
import pandas as pd
import pdfplumber
from .extract import extract_save
def extract_situation_loc(table):
    """Build the per-tenant totals frame from the tenant-situation table.

    ``table`` is a list of rows whose first row holds the column names. Every
    row whose "Locataires" cell equals "Totaux" is kept and extended with the
    fields that ``parse_above_loc`` derives from the "Locataires" cell of the
    row just above it.

    Args:
        table: List of rows (lists of cell values), header row first.

    Returns:
        A DataFrame with one row per "Totaux" row, without the "Locataires",
        "" and "Période" columns.

    Raises:
        ValueError: If ``table`` is empty and therefore has no header row.
    """
    try:
        df = pd.DataFrame(table[1:], columns=table[0])
    except IndexError as err:
        # The table used to be printed here and execution then continued with
        # an unbound ``df``; fail with an explicit error instead.
        raise ValueError(f"tenant table has no header row: {table!r}") from err

    rows = []
    for i, row in df[df["Locataires"] == "Totaux"].iterrows():
        # ``df`` is built without an explicit index, so the label ``i`` is
        # also the row position and ``i - 1`` addresses the row above.
        above_row_loc = df.iloc[i - 1]["Locataires"]
        rows.append(pd.concat([row, parse_above_loc(above_row_loc)]))

    # Each collected Series becomes a column, hence the transpose.
    df_cleaned = pd.concat(rows, axis=1).T
    df_cleaned.drop(["Locataires", "", "Période"], axis=1, inplace=True)
    return df_cleaned
@click.group()
def main():
    # Root command group. The `extract` group and the `join` command are
    # registered on it through `@main.group()` / `@main.command()`.
    # Kept as a comment: click would surface a docstring as --help text.
    pass
def parse_above_loc(content):
    """Split the cell found above a "Totaux" row into lot, type and tenant.

    When ``content`` holds exactly two lines, the first line is split on
    spaces: its second token becomes ``lot`` and the remaining tokens, joined
    by spaces, become ``type``. The second line becomes ``locataire``. For
    any other number of lines the whole ``content`` is stored as
    ``locataire`` and ``lot`` / ``type`` stay empty.

    Args:
        content: Text of the "Locataires" cell.

    Returns:
        A ``pd.Series`` with the keys "lot", "type" and "locataire".
    """
    lot, kind, tenant = "", "", content

    parts = content.split("\n")
    if len(parts) == 2:
        app, tenant = parts
        tokens = app.split(" ")
        # A one-word first line carries no lot token; keep lot/type empty
        # instead of raising IndexError on tokens[1].
        if len(tokens) > 1:
            lot = tokens[1]
            kind = " ".join(tokens[2:])

    return pd.Series({"lot": lot, "type": kind, "locataire": tenant})
@main.group()
def extract():
    # Sub-group of `main`; the extraction commands are registered on it
    # through `@extract.command()`.
    # Kept as a comment: click would surface a docstring as --help text.
    pass
def extract_charge(table):
    """Turn the raw charge table into a cleaned DataFrame.

    ``table`` is a list of rows whose first row holds the column names.
    Empty cells are converted to NaN, rows without a "Débits" value are
    dropped, and so are the summary rows whose "RECAPITULATIF DES OPERATIONS"
    label contains "TOTAUX" or "solde" (case-insensitive).

    Args:
        table: List of rows (lists of cell values), header row first.

    Returns:
        The filtered DataFrame; the surviving rows keep their original index
        labels.
    """
    df = (
        pd.DataFrame(table[1:], columns=table[0])
        .replace("", np.nan)
        .dropna(subset=["Débits"])
    )

    label = df["RECAPITULATIF DES OPERATIONS"]
    # The two masks must be combined element-wise with `|`: Python's `or`
    # asks for the truth value of a whole Series and raises ValueError.
    # `na=False` keeps rows whose label is NaN (emptied by the replace above)
    # instead of leaving NaN in the boolean mask.
    is_summary = label.str.contains(
        "TOTAUX", case=False, na=False
    ) | label.str.contains("solde", case=False, na=False)

    df.drop(df[is_summary].index, inplace=True)
    return df
# Settings handed to `page.extract_table` when reading the charges page:
# the vertical strategy is "lines" and the horizontal strategy is "text".
charge_table_settings = dict(
    vertical_strategy="lines",
    horizontal_strategy="text",
)
def extract_from_pdf(pdf, charge_dest, location_dest):
    """Build the charge and tenant xlsx files from an opened PDF.

    Every page whose text has a line containing "SITUATION DES LOCATAIRES"
    contributes its table to the tenant sheet; the last two space-separated
    words of that line are stored in the ``mois`` and ``annee`` columns.
    Every page whose text contains "HONORAIRES" has its table cleaned by
    ``extract_charge`` and written to ``charge_dest``.

    Args:
        pdf: Opened PDF object exposing ``pages``; each page must provide
            ``extract_text()`` and ``extract_table()``.
        charge_dest: Destination of the "Charges" workbook.
        location_dest: Destination of the "Location" workbook.

    Raises:
        ValueError: If no tenant table could be collected from the PDF.
    """
    loc_table = []
    mois = annee = None
    for page in pdf.pages:
        # Extract the text once per page; both checks below read it.
        text = page.extract_text() or ""
        situation_loc_line = [
            line for line in text.split("\n") if "SITUATION DES LOCATAIRES" in line
        ]
        if situation_loc_line:
            mois, annee = situation_loc_line[0].split(" ")[-2:]
            # NOTE(review): extract_table() appears to return None when the
            # page holds no table - treated here as "no rows"; confirm.
            table = page.extract_table() or []
            # Only the first collected table keeps its header row.
            loc_table += table[1:] if loc_table else table

        if "HONORAIRES" in text:
            df_charge = extract_charge(page.extract_table(charge_table_settings))
            df_charge.to_excel(charge_dest, sheet_name="Charges", index=False)
            print(f"{charge_dest} saved")

    if not loc_table:
        # Without this guard the code below failed on unbound month/year
        # variables or on an empty table, with an unrelated error message.
        raise ValueError('no "SITUATION DES LOCATAIRES" table found in the PDF')

    df_loc = extract_situation_loc(loc_table)
    df_loc = df_loc.assign(mois=mois, annee=annee)
    df_loc.to_excel(location_dest, sheet_name="Location", index=False)
    print(f"{location_dest} saved")
def extract_save(pdf_file, dest="./"):
    """Extract the charge and tenant tables of ``pdf_file`` into xlsx files.

    Two workbooks are written into ``dest``. Both are named after the PDF
    stem, with spaces replaced by underscores, and suffixed ``_charge.xlsx``
    and ``_locataire.xlsx`` respectively.

    Args:
        pdf_file: Path (``str`` or ``Path``) of the PDF to read.
        dest: Directory receiving the two xlsx files. Defaults to the current
            directory, which is where the files were written before this
            parameter existed.
    """
    pdf_file = Path(pdf_file)
    dest = Path(dest)
    stem = pdf_file.stem.replace(" ", "_")
    xls_charge = dest / f"{stem}_charge.xlsx"
    xls_locataire = dest / f"{stem}_locataire.xlsx"

    # Context manager so the PDF handle is released even if extraction fails.
    with pdfplumber.open(pdf_file) as pdf:
        extract_from_pdf(pdf, xls_charge, xls_locataire)
@click.command()
@extract.command()
@click.option("--pdf_file", help="Nom du fichier pdf", default="")
@click.option("--dest", help="Où mettre les fichiers produits", default="./")
def file(pdf_file, dest):
    # Convert one PDF into its xlsx files, written into --dest.
    # Kept as a comment: click would surface a docstring as --help text.
    if not pdf_file:
        # The option defaults to ""; reject it explicitly rather than letting
        # an empty path reach the PDF reader.
        raise click.UsageError("Le nom du fichier pdf est requis (--pdf_file)")

    d = Path(dest)
    d.mkdir(parents=True, exist_ok=True)
    # The imported extract_save expects the destination directory as its
    # second argument, exactly as the `all` command passes it.
    extract_save(pdf_file, d)
@extract.command()
@click.option("--folder", help="Tous les fichiers dans folder", default="./")
def pdf2xlsx(pdf_file, folder):
    """Convert one PDF, or every PDF found directly inside ``folder``.

    Args:
        pdf_file: Path of a single PDF; when empty, ``folder`` is scanned
            instead.
        folder: Directory whose PDF files are converted when ``pdf_file`` is
            empty.
    """
    if pdf_file:
        extract_save(pdf_file)
        return

    # Match on the file suffix rather than on a ".pdf" substring of the whole
    # path, which also matched names like "a.pdf.bak" or any file located in a
    # folder whose own name contains ".pdf".
    pdf_files = sorted(
        x
        for x in Path(folder).iterdir()
        if x.is_file() and x.suffix.lower() == ".pdf"
    )
    for found in pdf_files:
        extract_save(found)
@click.option("--dest", help="Où mettre les fichiers produits", default="./")
def all(folder, dest):
    # Convert every PDF found directly inside `folder`, writing the xlsx
    # files into `dest`.
    # Kept as a comment: click would surface a docstring as --help text.
    # NOTE(review): the function name shadows the builtin `all` in this
    # module; it is left unchanged here because it is the block's public name.
    p = Path(folder)
    d = Path(dest)
    # parents=True so a nested destination such as "out/2022" can be created.
    d.mkdir(parents=True, exist_ok=True)

    # Match on the file suffix rather than on a ".pdf" substring of the whole
    # path, which also matched names like "a.pdf.bak" or any file located in a
    # folder whose own name contains ".pdf".
    pdf_files = sorted(
        x for x in p.iterdir() if x.is_file() and x.suffix.lower() == ".pdf"
    )
    for pdf_file in pdf_files:
        extract_save(pdf_file, d)
# Run the command only when the module is executed as a script.
if __name__ == "__main__":
    pdf2xlsx()
@main.command()
def join():
    # Placeholder command registered on `main`; it does nothing yet.
    # Kept as a comment: click would surface a docstring as --help text.
    pass