Feat: change name of package
This commit is contained in:
parent
a2ac6c3c88
commit
75cd6c05c2
116
pdf_oralia/scripts.py
Normal file
116
pdf_oralia/scripts.py
Normal file
@ -0,0 +1,116 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import click
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import pdfplumber
|
||||||
|
|
||||||
|
|
||||||
|
def extract_situation_loc(table):
    """Build one row per "Totaux" line of the tenant-situation table.

    The first entry of ``table`` is used as the header row. Every row whose
    "Locataires" cell equals "Totaux" is kept and extended with the "lot",
    "type" and "locataire" values parsed (by ``parse_above_loc``) from the
    "Locataires" cell of the row located just above it.

    Args:
        table: List of rows (each a list of cell values), header row first.

    Returns:
        A DataFrame with one row per "Totaux" line, without the
        "Locataires", "" and "Période" columns. An empty DataFrame is
        returned when the table is empty or contains no "Totaux" row.
    """
    # An empty table used to be printed and then crash on the unbound ``df``;
    # there is nothing to extract, so return an explicit empty result.
    if not table:
        return pd.DataFrame()

    df = pd.DataFrame(table[1:], columns=table[0])

    rows = []
    for i, row in df[df["Locataires"] == "Totaux"].iterrows():
        # ``df`` carries the default RangeIndex, so the label ``i`` is also
        # the position. A "Totaux" row at position 0 has no row above it;
        # ``iloc[-1]`` would silently pick the last row of the table instead.
        above_row_loc = df.iloc[i - 1]["Locataires"] if i > 0 else ""
        rows.append(pd.concat([row, parse_above_loc(above_row_loc)]))

    if not rows:
        # pd.concat raises ValueError when given an empty list.
        return pd.DataFrame()

    df_cleaned = pd.concat(rows, axis=1).T
    return df_cleaned.drop(["Locataires", "", "Période"], axis=1)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_above_loc(content):
    """Split a "Locataires" cell into lot number, lot type and tenant name.

    When the cell holds exactly two lines, the first line is split on
    spaces: its second word becomes "lot", the remaining words become
    "type", and the second line becomes "locataire". Any other layout is
    kept whole as "locataire" with empty "lot" and "type".

    Args:
        content: Text of the cell, or ``None`` for an empty cell.

    Returns:
        A ``pd.Series`` with the keys "lot", "type" and "locataire".
    """
    # Assumption: the table extractor may hand back None for an empty cell;
    # treat it as empty text instead of failing on ``None.split``.
    if content is None:
        content = ""

    try:
        app, loc = content.split("\n")
    except ValueError:
        # Not exactly two lines: there is no lot description to parse.
        return pd.Series({"lot": "", "type": "", "locataire": content})

    app_ = app.split(" ")
    return pd.Series(
        {
            # The first word is skipped. A one-word line has no second word,
            # which used to raise IndexError; report an empty lot instead.
            "lot": app_[1] if len(app_) > 1 else "",
            "type": " ".join(app_[2:]),
            "locataire": loc,
        }
    )
|
||||||
|
|
||||||
|
|
||||||
|
def extract_charge(table):
    """Turn the raw charges table into a DataFrame of individual operations.

    The first entry of ``table`` is used as the header row. Empty cells are
    replaced by NaN, rows without a "Débits" value are dropped, and rows
    whose "RECAPITULATIF DES OPERATIONS" label contains "totaux" or "solde"
    (case-insensitive) are removed.

    Args:
        table: List of rows (each a list of cell values), header row first.

    Returns:
        The filtered DataFrame.
    """
    df = (
        pd.DataFrame(table[1:], columns=table[0])
        .replace("", np.nan)
        .dropna(subset=["Débits"])
    )

    label = df["RECAPITULATIF DES OPERATIONS"]
    # Combine the two masks element-wise with ``|``: Python's ``or`` asks for
    # the truth value of a whole Series and raises ValueError.
    # ``na=False`` keeps rows whose label is NaN (empty cells were replaced
    # above) instead of producing a mask that cannot be used for indexing.
    is_summary = label.str.contains(
        "TOTAUX", case=False, na=False
    ) | label.str.contains("solde", case=False, na=False)

    return df.drop(df[is_summary].index)
|
||||||
|
|
||||||
|
|
||||||
|
# Table-extraction settings passed to ``page.extract_table`` for the pages
# that hold the charges ("HONORAIRES") table.
charge_table_settings = dict(
    vertical_strategy="lines",
    horizontal_strategy="text",
)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_from_pdf(pdf, charge_dest, location_dest):
    """Export the charges and the tenant situation found in ``pdf`` to Excel.

    Pages whose text contains "SITUATION DES LOCATAIRES" contribute their
    table to the tenant table (the first row of every page after the first
    one is skipped); the last two space-separated words of that line are
    stored as ``mois`` and ``annee``. Pages whose text contains "HONORAIRES"
    are parsed with ``charge_table_settings`` and written to ``charge_dest``.

    Args:
        pdf: Object exposing ``pages``; each page provides ``extract_text()``
            and ``extract_table()``.
        charge_dest: Path of the Excel file receiving the "Charges" sheet.
        location_dest: Path of the Excel file receiving the "Location" sheet.

    Returns:
        None. Files are written as a side effect.
    """
    loc_table = []
    for page in pdf.pages:
        # Extract the text once per page (it was extracted twice before) and
        # tolerate a page that yields no text at all.
        page_text = page.extract_text() or ""

        situation_loc_line = [
            line
            for line in page_text.split("\n")
            if "SITUATION DES LOCATAIRES" in line
        ]
        if situation_loc_line:
            mois, annee = situation_loc_line[0].split(" ")[-2:]
            # A page without a detectable table yields None; treat it as empty
            # rather than failing on ``None[1:]``.
            page_table = page.extract_table() or []
            if loc_table:
                # Skip the first row of continuation pages.
                loc_table += page_table[1:]
            else:
                loc_table = page_table

        if "HONORAIRES" in page_text:
            df_charge = extract_charge(page.extract_table(charge_table_settings))
            df_charge.to_excel(charge_dest, sheet_name="Charges", index=False)
            print(f"{charge_dest} saved")

    if not loc_table:
        # Without any tenant page, ``mois``/``annee`` were never bound and the
        # code below used to crash with a NameError; skip the export instead.
        print(f"No tenant table found, {location_dest} not saved")
        return

    df_loc = extract_situation_loc(loc_table)
    df_loc = df_loc.assign(mois=mois, annee=annee)
    df_loc.to_excel(location_dest, sheet_name="Location", index=False)
    print(f"{location_dest} saved")
|
||||||
|
|
||||||
|
|
||||||
|
def extract_save(pdf_file):
    """Extract the tables of one PDF report and save them as Excel files.

    The output names are derived from the PDF file name without its
    extension, with spaces replaced by underscores: ``<stem>_charge.xlsx``
    and ``<stem>_locataire.xlsx``. They are relative paths, so they resolve
    against the current working directory.

    Args:
        pdf_file: Path (``str`` or ``Path``) of the PDF file to process.
    """
    pdf_file = Path(pdf_file)
    stem = pdf_file.stem.replace(" ", "_")
    xls_charge = f"{stem}_charge.xlsx"
    xls_locataire = f"{stem}_locataire.xlsx"

    # Use the context manager so the PDF is closed even when extraction
    # fails; the handle was previously left open.
    with pdfplumber.open(pdf_file) as pdf:
        extract_from_pdf(pdf, xls_charge, xls_locataire)
|
||||||
|
|
||||||
|
|
||||||
|
# Command-line entry point: convert a single PDF (``--pdf_file``) or, when no
# file is given, every PDF file found directly inside ``--folder``.
@click.command()
@click.option("--pdf_file", help="Nom du fichier pdf", default="")
@click.option("--folder", help="Tous les fichiers dans folder", default="./")
def pdf2xlsx(pdf_file, folder):
    # NOTE: documented with comments on purpose; a docstring here would
    # become the text click prints for ``--help``.
    if pdf_file:
        extract_save(pdf_file)
        return

    # Select on the real file extension. The former substring test
    # (".pdf" in str(path)) also matched directories, names such as
    # "report.pdf.bak" and every file under a folder whose own path contains
    # ".pdf", while missing upper-case ".PDF" files.
    pdf_files = sorted(
        path
        for path in Path(folder).iterdir()
        if path.is_file() and path.suffix.lower() == ".pdf"
    )
    for path in pdf_files:
        extract_save(path)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the command-line interface when this module is executed as a script.
if __name__ == "__main__":
    pdf2xlsx()
|
@ -1,10 +1,13 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "pdf-auralia"
|
name = "pdf-oralia"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
description = ""
|
description = ""
|
||||||
authors = ["Bertrand Benjamin <benjamin.bertrand@opytex.org>"]
|
authors = ["Bertrand Benjamin <benjamin.bertrand@opytex.org>"]
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
packages = [{include = "pdf_auralia"}]
|
packages = [{include = "pdf_oralia"}]
|
||||||
|
|
||||||
|
[tool.poetry.scripts]
|
||||||
|
pdf-oralia = "pdf_oralia.scripts:pdf2xlsx"
|
||||||
|
|
||||||
[tool.poetry.dependencies]
|
[tool.poetry.dependencies]
|
||||||
python = "^3.10"
|
python = "^3.10"
|
||||||
|
Loading…
Reference in New Issue
Block a user