29 Commits

SHA1 Message Date
f12e5c05a1 add secrets.env to gitignore 2025-03-15 09:32:49 +01:00
3b9db43f0d jupyter notebook 2025-03-15 09:30:33 +01:00
8df9be825f Feat: nothing 2025-03-01 13:55:45 +01:00
d0fc473134 Fix: cas where impayé raise error 2025-03-01 13:51:15 +01:00
092b925b68 doc: add utilisation in readme 2025-02-26 09:08:17 +01:00
3c18bd5d81 Feat: script command all works recursively 2025-02-26 09:02:33 +01:00
4ee78a7e7b Feat: specify page type before extracting it 2025-02-26 05:58:38 +01:00
ce8cdc4c1e Feat: use fsm to extract lines from pdf 2025-02-26 05:54:44 +01:00
6e0ffe9085 core: change pandas version 2024-10-16 06:47:55 +02:00
ab2fdb0541 Feat: make from_pdf importable and move plumber in it 2024-10-16 06:47:25 +02:00
0fc39ed317 Merge pull request 'Update dependency MarkupSafe to v2.1.5' (#17) from renovate/markupsafe-2.x into main
Reviewed-on: #17
2024-02-17 05:08:19 +00:00
a6d6681756 Merge branch 'main' into renovate/markupsafe-2.x 2024-02-17 05:08:08 +00:00
4eecb3a44c Merge pull request 'Update dependency Jinja2 to v3.1.3' (#16) from renovate/jinja2-3.x into main
Reviewed-on: #16
2024-02-17 05:07:48 +00:00
60da623323 Update dependency MarkupSafe to v2.1.5 2024-02-17 05:04:52 +00:00
1f1e3e2741 Update dependency Jinja2 to v3.1.3 2024-02-17 05:04:48 +00:00
2b3e935f39 Merge pull request 'Update dependency Send2Trash to v1.8.2' (#15) from renovate/send2trash-1.x into main
Reviewed-on: #15
2024-02-17 04:56:16 +00:00
ef63f22d44 Merge pull request 'Update dependency MarkupSafe to v2.1.3' (#14) from renovate/markupsafe-2.x into main
Reviewed-on: #14
2024-02-17 04:55:55 +00:00
1020ef9257 Update dependency Send2Trash to v1.8.2 2024-01-10 11:04:32 +00:00
39084ceebd Update dependency MarkupSafe to v2.1.3 2024-01-10 11:04:30 +00:00
7de6c8dd9c clean renovate.json 2024-01-10 10:46:45 +00:00
da3815eea6 activate renovate 2024-01-09 06:53:09 +00:00
45d343d810 Feat: add raise error when src does not exists 2024-01-02 22:22:58 +01:00
806227f202 Feat: add logging in join 2023-12-30 17:45:15 +01:00
7bf0c38883 Feat: add option for debugging 2023-12-30 17:25:40 +01:00
b15b059e2a Add debug 2023-12-27 19:58:12 +01:00
48e75358ac Fix: remove index in excel outputs 2023-10-05 15:22:14 +02:00
132e37267b Feat: logging and option about overwritting 2023-10-05 15:19:16 +02:00
f2bcf6241a Fix: rebuild join_excel 2023-10-05 15:10:39 +02:00
ec9cc19be5 fix: remove when 2023-09-20 09:37:50 +02:00
15 changed files with 2873 additions and 1998 deletions

View File

@@ -27,7 +27,7 @@ steps:
   environment:
     MATRIX_ROOMID:
       from_secret: MATRIX_ROOMID
     MATRIX_ACCESSTOKEN:
       from_secret: MATRIX_ACCESSTOKEN
     MATRIX_USERID:
       from_secret: MATRIX_USERID
@@ -35,11 +35,6 @@ steps:
     homeserver: https://matrix.poneyworld.net
     template: "Une nouvelle version (${DRONE_TAG}) de pdf-oralia est publiée!"
-    when:
-      event:
-        include:
-          - tag
 # Déclencheur de la pipeline
 trigger:
   event:

1
.gitignore vendored
View File

@@ -1,3 +1,4 @@
+secrets.env
 pdfs/
 output/

File diff suppressed because one or more lines are too long

View File

@@ -1,3 +1,23 @@
 # PDF AURALIA

 Extraction de fichiers de comptabilité en pdf vers xlsx.
+
+## Utilisation
+
+- Lancement sur un fichier pdf particulier
+
+```bash
+pdf_oralia extract on <pdf_file> --dest <where to put produced files>
+```
+
+- Lancement sur tous les fichiers d'un repertoire (récursivement)
+
+```bash
+pdf_oralia extract all --src <source folder> --dest <destination folder>
+```
+
+Cette commande reproduira la structure du dossier source dans destination. Seuls les fichiers non existants seront traités. Par défaut, les fichiers déjà produits ne seront pas écrasés.
+
+On peut ajouter les options suivantes:
+
+- `--force`: pour écraser les fichiers déjà traités
+- `--only-plan`: pour voir quels fichiers pourraient être créés sans le faire.

View File

@@ -0,0 +1 @@
+from .extract import from_pdf

View File

@@ -1,10 +1,11 @@
 import logging
 from datetime import datetime
 from pathlib import Path

+import pandas as pd
 import pdfplumber

-from pdf_oralia.pages import charge, locataire, patrimoine, recapitulatif
+from pdf_oralia.pages import charge, locataire, patrimoine

 extract_table_settings = {
     "vertical_strategy": "lines",
@@ -12,6 +13,10 @@ extract_table_settings = {
 }


+class ExtractError(Exception):
+    pass
+
+
 def extract_date(page_text):
     """Extract date from a page
@@ -32,68 +37,100 @@ def extract_building(page_text, buildings=["bloch", "marietton", "servient"]):
     raise ValueError("Pas d'immeuble trouvé")


-def catch_malformed_table(tables):
-    if len(tables) == 2:
-        return tables[0] + tables[1]
-    return tables[0]
-
-
-def from_pdf(pdf):
-    """Build dataframes one about charges and another on loc"""
-    recapitulatif_tables = []
-    loc_tables = []
-    charge_tables = []
-    patrimoie_tables = []
+def pdf_extract_tables_lines(pdf):
+    loc_sink = locataire.fsm()
+    next(loc_sink)
+    charge_sink = charge.fsm()
+    next(charge_sink)
+    patrimoine_sink = patrimoine.fsm()
+    next(patrimoine_sink)

     for page_number, page in enumerate(pdf.pages):
         page_text = page.extract_text()
         date = extract_date(page_text)
-        additionnal_fields = {
-            "immeuble": extract_building(page_text),
-            "mois": date.strftime("%m"),
-            "annee": date.strftime("%Y"),
-        }
-
-        if recapitulatif.is_it(page_text):
-            table = page.extract_tables()[0]
-            extracted = recapitulatif.extract(table, additionnal_fields)
-            if extracted:
-                recapitulatif_tables.append(extracted)
-
-        elif locataire.is_it(page_text):
-            tables = page.extract_tables(extract_table_settings)[1:]
-            table = catch_malformed_table(tables)
-            extracted = locataire.extract(table, additionnal_fields)
-            loc_tables.append(extracted)
+        try:
+            additionnal_fields = {
+                "immeuble": extract_building(page_text),
+                "mois": date.strftime("%m"),
+                "annee": date.strftime("%Y"),
+            }
+        except ValueError:
+            logging.warning(
+                f"L'immeuble de la page {page_number+1} non identifiable. Page ignorée."
+            )
+            continue
+
+        table_type = ""
+        if locataire.is_it(page_text):
+            table_type = "locataire"
         elif charge.is_it(page_text):
-            tables = page.extract_tables(extract_table_settings)[1:]
-            table = catch_malformed_table(tables)
-            extracted = charge.extract(table, additionnal_fields)
-            charge_tables.append(extracted)
+            table_type = "charge"
         elif patrimoine.is_it(page_text):
-            pass
+            table_type = "patrimoine"
+        else:
+            logging.warning(
+                f"Type de la page {page_number+1} non identifiable. Page ignorée."
+            )
+            continue
+
+        for line in page.extract_table(extract_table_settings):
+            if table_type == "locataire":
+                res = loc_sink.send(line)
+                if res:
+                    res.update(additionnal_fields)
+                    yield locataire.Line(**res)
+            elif table_type == "charge":
+                res = charge_sink.send(line)
+                if res:
+                    res.update(additionnal_fields)
+                    yield charge.Line(**res)
+            elif table_type == "patrimoine":
+                res = patrimoine_sink.send(line)
+                if res:
+                    res.update(additionnal_fields)
+                    yield patrimoine.Line(**res)
+
+
+def from_pdf(pdf_file):
+    """Build dataframes one about charges and another on loc"""
+    pdf = pdfplumber.open(pdf_file)
+
+    locataire_lines = []
+    charge_lines = []
+    patrimoine_lines = []
+    for line in pdf_extract_tables_lines(pdf):
+        if isinstance(line, locataire.Line):
+            locataire_lines.append(line)
+        elif isinstance(line, charge.Line):
+            charge_lines.append(line)
+        elif isinstance(line, patrimoine.Line):
+            patrimoine_lines.append(line)
         else:
             logging.warning(f"Page {page_number+1} non reconnu. Page ignorée.")

-    df_charge = charge.table2df(recapitulatif_tables + charge_tables)
-    df_loc = locataire.table2df(loc_tables)
-
-    return df_charge, df_loc
+    return {
+        "charge": pd.DataFrame([c.__dict__ for c in charge_lines]),
+        "locataire": pd.DataFrame([c.__dict__ for c in locataire_lines]),
+        "patrimoine": pd.DataFrame([c.__dict__ for c in patrimoine_lines]),
+    }


-def extract_save(pdf_file, dest):
+def extract_plan(pdf_file, dest):
+    return {
+        "charge": Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_charge.xlsx",
+        "locataire": Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_locataire.xlsx",
+        "patrimoine": Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_patrimoine.xlsx",
+    }
+
+
+def extract_save(pdf_file, dest, save=[]):
     """Extract charge and locataire for pdf_file and put xlsx file in dest"""
     pdf_file = Path(pdf_file)
-    xls_charge = Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_charge.xlsx"
-    xls_locataire = Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_locataire.xlsx"
-
-    pdf = pdfplumber.open(pdf_file)
-    df_charge, df_loc = from_pdf(pdf)
-
-    df_charge.to_excel(xls_charge, sheet_name="Charges", index=False)
-    logging.info(f"{xls_charge} saved")
-    df_loc.to_excel(xls_locataire, sheet_name="Location", index=False)
-    logging.info(f"{xls_locataire} saved")
+    xlss = extract_plan(pdf_file, dest)
+
+    dfs = from_pdf(pdf_file)
+
+    for s in save:
+        dfs[s].to_excel(xlss[s], sheet_name=s, index=False)
+        logging.info(f"{xlss[s]} saved")
+
+    return {k: v for k, v in xlss.items() if k in save}
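
Review note: the rewrite above replaces per-page `extract_tables` calls with one coroutine ("sink") per page type, fed row by row. A minimal sketch of that prime/send protocol — `toy_fsm` below is invented for illustration and is not part of the diff:

```python
# toy_fsm is an invented stand-in for charge.fsm / locataire.fsm /
# patrimoine.fsm: it assembles a record across several table rows and
# only yields it back once the record is complete.
def toy_fsm():
    row = {}
    line = yield  # wait for the first row
    while True:
        if line[0]:           # first column starts a new record
            row = {"label": line[0]}
            line = yield      # nothing complete yet -> caller gets None
        elif line[1]:         # second column completes the record
            row["value"] = line[1]
            line = yield row  # caller gets the assembled dict
            row = {}
        else:
            line = yield

sink = toy_fsm()
next(sink)  # prime: run the generator to its first `yield`
for raw in [["loyer", ""], ["", "450"]]:
    res = sink.send(raw)
    if res:
        print(res)  # {'label': 'loyer', 'value': '450'}
```

Priming with `next()` advances the generator to its first `yield`; each subsequent `send(line)` returns either `None` (row still being assembled) or a completed dict, which `pdf_extract_tables_lines` then validates into a pydantic `Line`.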

29
pdf_oralia/join.py Normal file
View File

@@ -0,0 +1,29 @@
+import glob
+import logging
+
+import pandas as pd
+
+
+def join_excel(src, dest, file_pattern):
+    """Join every excel file in arc respecting file_pattern into on unique file in dist"""
+    filenames = list_files(src, file_pattern)
+    logging.debug(f"Concatenate {filenames}")
+    dfs = extract_dfs(filenames)
+    joined_df = pd.concat(dfs)
+    logging.debug(f"Writing joined excel to {dest}")
+    joined_df.to_excel(dest, index=False)
+    logging.debug(f"with {len(joined_df)} rows")
+
+
+def list_files(src, file_glob):
+    return list(glob.iglob(f"{src}/{file_glob}"))
+
+
+def extract_dfs(filenames):
+    dfs = []
+    for filename in filenames:
+        logging.debug(f"Extracting {filename}")
+        df = pd.read_excel(filename)
+        logging.debug(f"Found {len(df)} rows")
+        dfs.append(df)
+    return dfs
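
For context, a hypothetical call to the new module (both paths are invented examples):

```python
# Concatenate every *_charge.xlsx sitting directly under ./output into one
# workbook: join_excel globs src, reads each match with pandas.read_excel,
# and writes the pd.concat result to dest.
from pdf_oralia.join import join_excel

join_excel("./output", "./charge.xlsx", "*_charge.xlsx")
```

As written, `list_files` expands `{src}/{file_glob}` without `recursive=True`, so only the top level of `src` is scanned, and `pd.concat` raises `ValueError` when the pattern matches nothing.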

View File

@@ -1,9 +1,16 @@
 import re

-import numpy as np
-import pandas as pd
+from pydantic import BaseModel, field_validator

-RECAPITULATIF_DES_OPERATIONS = 1
+HEADER_CHARGE = [
+    "",
+    "RECAPITULATIF DES OPERATIONS",
+    "Débits",
+    "Crédits",
+    "Dont T.V.A.",
+    "Locatif",
+    "Déductible",
+]

 DF_TYPES = {
     "Fournisseur": str,
     "RECAPITULATIF DES OPERATIONS": str,
@@ -17,7 +24,30 @@ DF_TYPES = {
     "annee": str,
     "lot": str,
 }

-DEFAULT_FOURNISSEUR = "ROSIER MODICA MOTTEROZ SA"
+
+class Line(BaseModel):
+    mois: int
+    annee: int
+    immeuble: str
+    lot: str
+    Champs: str
+    Categorie: str
+    Fournisseur: str
+    Libellé: str
+    Débit: float
+    Crédits: float
+    Dont_TVA: float
+    Locatif: float
+    Déductible: float
+
+    @field_validator(
+        "Débit", "Crédits", "Dont_TVA", "Locatif", "Déductible", mode="before"
+    )
+    def set_default_if_empty(cls, v):
+        if v == "":
+            return 0
+        return v


 def is_it(page_text):
@@ -41,51 +71,54 @@ def get_lot(txt):
     return "*"


-def keep_row(row):
-    return not any(
-        [
-            word.lower() in row[RECAPITULATIF_DES_OPERATIONS].lower()
-            for word in ["TOTAL", "TOTAUX", "Solde créditeur", "Solde débiteur"]
-        ]
-    )
-
-
-def extract(table, additionnal_fields: dict = {}):
-    """Turn table to dictionary with additional fields"""
-    extracted = []
-    header = table[0]
-    for row in table[1:]:
-        if keep_row(row):
-            r = dict()
-            for i, value in enumerate(row):
-                if header[i] == "":
-                    r["Fournisseur"] = value
-                else:
-                    r[header[i]] = value
-
-            for k, v in additionnal_fields.items():
-                r[k] = v
-
-            if "honoraire" in row[RECAPITULATIF_DES_OPERATIONS].lower():
-                r["Fournisseur"] = DEFAULT_FOURNISSEUR
-
-            extracted.append(r)
-
-    return extracted
-
-
-def table2df(tables):
-    dfs = []
-    for table in tables:
-        df = (
-            pd.DataFrame.from_records(table)
-            .replace("", np.nan)
-            .dropna(subset=["Débits", "Crédits"], how="all")
-        )
-        df["Fournisseur"] = df["Fournisseur"].fillna(method="ffill")
-        dfs.append(df)
-    df = pd.concat(dfs)
-
-    df["immeuble"] = df["immeuble"].apply(lambda x: x[0].capitalize())
-    df["lot"] = df["RECAPITULATIF DES OPERATIONS"].apply(get_lot)
-    return df.astype(DF_TYPES)
+def fsm():
+    current_state = "total"
+    row = {}
+    line = yield
+    while True:
+        if line == HEADER_CHARGE:
+            line = yield
+        if current_state == "total":
+            if line[1].lower().split(" ")[0] in ["total", "totaux"]:
+                current_state = "new_champs"
+            line = yield
+        elif current_state == "new_champs":
+            if line[0] != "":
+                current_state = "new_cat_line"
+                row = {"Champs": line[0], "Categorie": "", "Fournisseur": ""}
+            line = yield
+        elif current_state == "new_cat_line":
+            if line[1].lower().split(" ")[0] in ["total", "totaux"]:
+                current_state = "new_champs"
+                line = yield
+                row = {}
+            elif line[2] != "" or line[3] != "":
+                row.update(
+                    {
+                        "Fournisseur": line[0] if line[0] != "" else row["Fournisseur"],
+                        "Libellé": line[1],
+                        "lot": get_lot(line[1]),
+                        "Débit": line[2],
+                        "Crédits": line[3],
+                        "Dont_TVA": line[4],
+                        "Locatif": line[5],
+                        "Déductible": line[6],
+                    }
+                )
+                line = yield row
+                row = {
+                    "Champs": row["Champs"],
+                    "Categorie": row["Categorie"],
+                    "Fournisseur": row["Fournisseur"],
+                }
+            elif line[0] != "" and line[1] == "":
+                row.update({"Categorie": line[0]})
+                line = yield
+            elif line[1] != "":
+                row.update({"Categorie": line[1]})
+                line = yield
+            elif line[0] != "":
+                row.update({"Fournisseur": line[0]})
+                line = yield
+            else:
+                line = yield
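
To make the state machine concrete, here is a small driving sketch — the three rows are invented, but shaped like the seven-column `HEADER_CHARGE` layout:

```python
from pdf_oralia.pages import charge

sink = charge.fsm()
next(sink)  # prime the coroutine up to its first `yield`

rows = [
    ["", "TOTAUX DE L'IMMEUBLE", "", "", "", "", ""],  # exits the initial "total" state
    ["ENTRETIEN", "", "", "", "", "", ""],             # opens a new "Champs" block
    ["PLOMBERIE SA", "Intervention lot 02", "120.00", "", "20.00", "", "120.00"],
]
for r in rows:
    res = sink.send(r)
    if res:  # only the third row completes a record
        print(res["Champs"], res["Fournisseur"], res["Débit"])
        # ENTRETIEN PLOMBERIE SA 120.00
```

The sink hands back plain dicts; type coercion only happens when `extract.py` wraps them in `charge.Line`.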

View File

@@ -1,22 +1,48 @@
-import numpy as np
-import pandas as pd
+from pydantic import BaseModel, field_validator

-DF_TYPES = {
-    "Locataires": str,
-    "Période": str,
-    "Loyers": float,
-    "Taxes": float,
-    "Provisions": float,
-    "Divers": str,
-    "Total": float,
-    "Réglés": float,
-    "Impayés": float,
-    "immeuble": str,
-    "mois": str,
-    "annee": str,
-    "Lot": str,
-    "Type": str,
-}
+HEADER_LOC = [
+    "Locataires",
+    "Période",
+    "Loyers",
+    "Taxes",
+    "Provisions",
+    "Divers",
+    "",
+    "Total",
+    "Réglés",
+    "Impayés",
+]
+
+
+class Line(BaseModel):
+    mois: int
+    annee: int
+    immeuble: str
+    Lot: str
+    Type: str
+    Locataire: str
+    Loyers: float
+    Taxes: float
+    Provisions: float
+    Divers: float
+    Total: float
+    Réglés: float
+    Impayés: float
+
+    @field_validator(
+        "Loyers",
+        "Taxes",
+        "Provisions",
+        "Divers",
+        "Total",
+        "Réglés",
+        "Impayés",
+        mode="before",
+    )
+    def set_default_if_empty(cls, v):
+        if v == "":
+            return 0
+        return v


 def is_it(page_text):
@@ -25,142 +51,56 @@ def is_it(page_text):
     return False


-def is_drop(row):
-    if "totaux" in row[0].lower():
-        return True
-    if not any(row):
-        return True
-    return False
-
-
-def extract(table, additionnal_fields: dict = {}):
-    """Turn table to dictionary with additional fields"""
-    extracted = []
-    header = table[0]
-    for row in table[1:]:
-        if not is_drop(row):
-            r = dict()
-            for i, value in enumerate(row):
-                if header[i] != "":
-                    r[header[i]] = value
-            for k, v in additionnal_fields.items():
-                r[k] = v
-            extracted.append(r)
-    return extracted
-
-
-def join_row(last, next):
-    row = {}
-    for key in last:
-        if last[key] == next[key]:
-            row[key] = last[key]
-        elif last[key] and next[key]:
-            row[key] = f"{last[key]}\n{next[key]}"
-        elif last[key]:
-            row[key] = last[key]
-        elif next[key]:
-            row[key] = next[key]
-        else:
-            row[key] = ""
-    return row
-
-
-def join_tables(tables):
-    joined = tables[0]
-    for t in tables[1:]:
-        last_row = joined[-1]
-        if "totaux" not in last_row["Locataires"].lower():
-            first_row = t[0]
-            joined_row = join_row(last_row, first_row)
-            joined = joined[:-1] + [joined_row] + t[1:]
-        else:
-            joined += t
-    return joined
-
-
 def parse_lot(string):
     words = string.split(" ")
     return {"Lot": "{:02d}".format(int(words[1])), "Type": " ".join(words[2:])}


-def clean_type(string):
-    if "appartement" in string.lower():
-        return string[-2:]
-    return string
-
-
-def join_row(table):
-    joined = []
-    for row in table:
-        if row["Locataires"].startswith("Lot"):
-            row.update(parse_lot(row["Locataires"]))
-            row["Locataires"] = ""
-            joined.append(row)
-
-        elif row["Locataires"] == "Rappel de Loyer":
-            last_row = joined[-1]
-            row.update(
-                {
-                    "Lot": last_row["Lot"],
-                    "Type": last_row["Type"],
-                    "Locataires": last_row["Locataires"],
-                    "Divers": "Rappel de Loyer",
-                }
-            )
-            joined.append(row)
-
-        elif row["Locataires"]:
-            last_row = joined.pop()
-            row_name = row["Locataires"].replace("\n", " ")
-            row.update({k: v for k, v in last_row.items() if v})
-            row["Locataires"] = last_row["Locataires"] + " " + row_name
-            joined.append(row)
-
-        else:
-            if row["Période"].startswith("Solde"):
-                last_row = joined.pop()
-                row.update(
-                    {
-                        "Lot": last_row["Lot"],
-                        "Type": last_row["Type"],
-                        "Locataires": last_row["Locataires"],
-                    }
-                )
-                joined.append(row)
-
-            elif row["Période"].startswith("Du"):
-                last_row = joined[-1]
-                row.update(
-                    {
-                        "Lot": last_row["Lot"],
-                        "Type": last_row["Type"],
-                        "Locataires": last_row["Locataires"],
-                    }
-                )
-                joined.append(row)
-    return joined
-
-
-def flat_tables(tables):
-    tables_flat = []
-    for table in tables:
-        tables_flat.extend(table)
-    return tables_flat
-
-
-def table2df(tables):
-    tables = flat_tables(tables)
-    joined = join_row(tables)
-    df = pd.DataFrame.from_records(joined)
-    df["immeuble"] = df["immeuble"].apply(lambda x: x[0].capitalize())
-    df["Type"] = df["Type"].apply(clean_type)
-    numeric_cols = [k for k, v in DF_TYPES.items() if v == float]
-    df[numeric_cols] = df[numeric_cols].replace("", np.nan)
-    df = df.drop(df[(df["Locataires"] == "") & (df["Période"] == "")].index)
-    return df.astype(DF_TYPES)
+def fsm():
+    current_state = "new_row"
+    row = {}
+    line = yield
+    while True:
+        if line == HEADER_LOC:
+            line = yield
+        elif current_state == "new_row":
+            if line[0] != "" and line[0] != "TOTAUX":
+                row.update(parse_lot(line[0]))
+                current_state = "add_loc"
+            line = yield
+        elif current_state == "add_loc":
+            if line[0] != "":
+                row["Locataire"] = line[0]
+                current_state = "add_totaux"
+            line = yield
+        elif current_state == "add_totaux":
+            if line[0] == "Totaux":
+                if line[6] is None:
+                    row.update(
+                        {
+                            "Loyers": line[2],
+                            "Taxes": line[3],
+                            "Provisions": line[4],
+                            "Divers": line[5],
+                            "Total": line[7],
+                            "Réglés": line[8],
+                            "Impayés": line[9],
+                        }
+                    )
+                else:
+                    row.update(
+                        {
+                            "Loyers": line[2],
+                            "Taxes": line[3],
+                            "Provisions": line[4],
+                            "Divers": line[5],
+                            "Total": line[6],
+                            "Réglés": line[7],
+                            "Impayés": line[8],
+                        }
+                    )
+                line = yield row
+                row = {}
+                current_state = "new_row"
+            else:
+                line = yield
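
Side note on the `mode="before"` validators shared by the three `Line` models: they coerce the empty strings that pdfplumber produces for blank cells into `0` before pydantic parses the numeric fields. A sketch with invented values:

```python
from pdf_oralia.pages import locataire

line = locataire.Line(
    mois=3, annee=2025, immeuble="Bloch",
    Lot="02", Type="Appartement", Locataire="DUPONT",
    Loyers="450.0", Taxes="", Provisions="", Divers="",
    Total="450.0", Réglés="450.0", Impayés="",
)
print(line.Impayés)  # 0.0 — the empty cell is coerced instead of failing validation
```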

View File

@@ -1,4 +1,74 @@
+from pydantic import BaseModel, field_validator
+
+HEADER_PATRIMOINE = [
+    "Etage",
+    "Lots",
+    "Type de lot",
+    "Nom du Locataire",
+    "Loyer Annuel",
+    "Début Bail",
+    "Fin Bail",
+    "Entrée",
+    "Départ",
+    "Révisé le",
+    "U",
+    "Dépôt Gar.",
+]
+
+
+class Line(BaseModel):
+    mois: int
+    annee: int
+    immeuble: str
+    Etage: str
+    Lot: str
+    Type: str
+    Locataire: str
+    Loyer_annuel: int
+    Debut_bail: str
+    Fin_bail: str
+    Entree: str
+    Depart: str
+    Revision_bail: str
+    Usage: str
+    Depot_garantie: float
+
+    @field_validator("Loyer_annuel", "Depot_garantie", mode="before")
+    def set_default_if_empty(cls, v):
+        if v == "":
+            return 0
+        return v
+
+
 def is_it(page_text):
     if "VOTRE PATRIMOINE" in page_text:
         return True
     return False
+
+
+def fsm():
+    current_state = "new_line"
+    row = {}
+    line = yield
+    while True:
+        if line == HEADER_PATRIMOINE:
+            line = yield
+        if current_state == "new_line":
+            if line[0] != "":
+                row = {
+                    "Etage": line[0],
+                    "Lot": line[1][-2:] if line[1] != "" else row["Lot"],
+                    "Type": line[2] if line[2] != "" else row["Type"],
+                    "Locataire": line[3],
+                    "Loyer_annuel": line[4].replace(" ", ""),
+                    "Debut_bail": line[5],
+                    "Fin_bail": line[6],
+                    "Entree": line[7],
+                    "Depart": line[8],
+                    "Revision_bail": line[9],
+                    "Usage": line[10],
+                    "Depot_garantie": line[11].replace(" ", ""),
+                }
+                line = yield row
+            else:
+                line = yield

View File

@@ -4,30 +4,34 @@ from pathlib import Path

 import click

-from .extract import extract_save
-
-logging_config = dict(
-    version=1,
-    formatters={"f": {"format": "%(levelname)-8s %(name)-12s %(message)s"}},
-    handlers={
-        "h": {
-            "class": "logging.StreamHandler",
-            "formatter": "f",
-            "level": logging.DEBUG,
-        }
-    },
-    root={
-        "handlers": ["h"],
-        "level": logging.DEBUG,
-    },
-)
-dictConfig(logging_config)
+from .extract import extract_save, extract_plan
+from .join import join_excel


 @click.group()
-def main():
-    pass
+@click.option("--debug/--no-debug", default=False)
+def main(debug):
+    if debug:
+        logging_level = logging.DEBUG
+    else:
+        logging_level = logging.INFO
+    logging_config = dict(
+        version=1,
+        formatters={"f": {"format": "%(levelname)-8s %(name)-12s %(message)s"}},
+        handlers={
+            "h": {
+                "class": "logging.StreamHandler",
+                "formatter": "f",
+                "level": logging_level,
+            }
+        },
+        root={
+            "handlers": ["h"],
+            "level": logging_level,
+        },
+    )
+    dictConfig(logging_config)


 @main.group()
@@ -38,31 +42,113 @@ def extract():

 @extract.command()
 @click.argument("pdf_file", required=1)
 @click.option("--dest", help="Où mettre les fichiers produits", default="")
-def on(pdf_file, dest):
+@click.option(
+    "--only-plan",
+    help="Ne produit rien mais indique les changements",
+    default=False,
+    is_flag=True,
+)
+@click.option(
+    "--force",
+    help="Écrase les fichiers produits précédemment",
+    default=False,
+    is_flag=True,
+)
+def on(pdf_file, dest, force, only_plan):
+    pdf_file = Path(pdf_file)
     if not dest:
         pdf_path = Path(pdf_file)
         dest = pdf_path.parent
+    else:
+        dest = Path(dest)

-    extract_save(pdf_file, dest)
+    assert pdf_file.exists()
+    logging.info(f"Found {pdf_file}")
+    plan_dest = extract_plan(pdf_file, dest)
+    save = []
+    for k, p in plan_dest.items():
+        if not p.exists() or force:
+            save.append(k)
+
+    if only_plan:
+        for s in save:
+            logging.info(f"Planing to create {plan_dest[s]}")
+    else:
+        dest.mkdir(parents=True, exist_ok=True)
+        extract_save(pdf_file, dest, save)


 @extract.command()
-@click.option("--src", help="Tous les fichiers dans folder", default="./")
+@click.option(
+    "--src", help="Tous les fichiers dans folder (de façon récursive)", default="./"
+)
 @click.option("--dest", help="Où mettre les fichiers produits", default="./")
-def all(src, dest):
-    p = Path(src)
-    d = Path(dest)
-    d.mkdir(exist_ok=True)
-    pdf_files = [x for x in p.iterdir() if ".pdf" in str(x)]
-    for pdf_file in pdf_files:
+@click.option(
+    "--only-plan",
+    help="Ne produit rien mais indique les changements",
+    default=False,
+    is_flag=True,
+)
+@click.option(
+    "--force",
+    help="Écrase les fichiers produits précédemment",
+    default=False,
+    is_flag=True,
+)
+def all(src, dest, force, only_plan):
+    src_path = Path(src)
+    dest = Path(dest)
+    dest.mkdir(exist_ok=True)
+    for pdf_file in src_path.rglob("**/*.pdf"):
+        relative_path = pdf_file.relative_to(src_path)
+        files_dest = dest / relative_path.parent
         logging.info(f"Found {pdf_file}")
-        extract_save(pdf_file, d)
+
+        plan_dest = extract_plan(pdf_file, files_dest)
+        save = []
+        for k, p in plan_dest.items():
+            if not p.exists() or force:
+                save.append(k)
+
+        if only_plan:
+            for s in save:
+                logging.info(f"Planing to create {plan_dest[s]}")
+        else:
+            files_dest.mkdir(parents=True, exist_ok=True)
+            extract_save(pdf_file, files_dest, save)


 @main.command()
 @click.option("--src", help="Tous les fichiers dans src", default="./")
 @click.option("--dest", help="Où mettre les fichiers produits", default="")
-def join(src, dest):
-    join_excel(src, dest, df_names=["charge", "locataire"])
+@click.option(
+    "--force",
+    help="Ecraser si le ficher destination existe.",
+    default=False,
+    is_flag=True,
+)
+def join(src, dest, force):
+    """Join tous les fichiers excel charge (resp locataire) de src dans un seul fichier charge.xlsx dans dist.
+
+    Exemple:
+        pdf-oralia join --src <dossier_source> --dest <dossier_destination>
+    """
+    dest_charge = f"{dest}/charge.xlsx"
+    if not force and Path(dest_charge).exists():
+        raise ValueError(f"The file {dest_charge} already exists")
+    dest_locataire = f"{dest}/locataire.xlsx"
+    if not force and Path(dest_locataire).exists():
+        raise ValueError(f"The file {dest_locataire} already exists")
+    if not Path(src).exists():
+        raise ValueError(f"The source directory ({src}) does not exists.")
+
+    join_excel(src, dest_charge, "*_charge.xlsx")
+    logging.info(f"Les données charges ont été concaténées dans {dest_charge}")
+    join_excel(src, dest_locataire, "*_locataire.xlsx")
+    logging.info(f"Les données locataires ont été concaténées dans {dest_locataire}")

2927
poetry.lock generated

File diff suppressed because it is too large

View File

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pdf-oralia"
-version = "dev"
+version = "0"
 description = ""
 authors = ["Bertrand Benjamin <benjamin.bertrand@opytex.org>"]
 readme = "README.md"
@@ -13,7 +13,7 @@ pdf-oralia = "pdf_oralia.scripts:main"
 python = "^3.10"
 click = "^8.1.3"
 pdfplumber = "^0.7.4"
-pandas = "^1.5.0"
+pandas = "^2.2.3"
 openpyxl = "^3.0.10"

2
renovate.json Normal file
View File

@@ -0,0 +1,2 @@
+{
+}

View File

@@ -1,76 +1,5 @@
-argon2-cffi==21.3.0
-argon2-cffi-bindings==21.2.0
-asttokens==2.0.8
-attrs==22.1.0
-backcall==0.2.0
-beautifulsoup4==4.11.1
-bleach==5.0.1
-cffi==1.15.1
-charset-normalizer==2.1.1
-cryptography==38.0.1
-debugpy==1.6.3
-decorator==5.1.1
-defusedxml==0.7.1
-entrypoints==0.4
-et-xmlfile==1.1.0
-executing==1.1.0
-fastjsonschema==2.16.2
-ipykernel==6.16.0
-ipython==8.5.0
-ipython-genutils==0.2.0
-ipywidgets==8.0.2
-jedi==0.18.1
-Jinja2==3.1.2
-jsonschema==4.16.0
-jupyter==1.0.0
-jupyter-console==6.4.4
-jupyter-core==4.11.1
-jupyter_client==7.3.5
-jupyterlab-pygments==0.2.2
-jupyterlab-widgets==3.0.3
-lxml==4.9.1
-MarkupSafe==2.1.1
-matplotlib-inline==0.1.6
-mistune==2.0.4
-nbclient==0.6.8
-nbconvert==7.0.0
-nbformat==5.6.1
-nest-asyncio==1.5.5
-notebook==6.4.12
-numpy==1.23.3
-openpyxl==3.0.10
-packaging==21.3
-pandas==1.5.0
-pandocfilters==1.5.0
-parso==0.8.3
-pdfminer.six==20220524
-pdfplumber==0.7.4
-pexpect==4.8.0
-pickleshare==0.7.5
-Pillow==9.2.0
-prometheus-client==0.14.1
-prompt-toolkit==3.0.31
-psutil==5.9.2
-ptyprocess==0.7.0
-pure-eval==0.2.2
-pycparser==2.21
-Pygments==2.13.0
-pyparsing==3.0.9
-pyrsistent==0.18.1
-python-dateutil==2.8.2
-pytz==2022.2.1
-pyzmq==24.0.1
-qtconsole==5.3.2
-QtPy==2.2.0
-Send2Trash==1.8.0
-six==1.16.0
-soupsieve==2.3.2.post1
-stack-data==0.5.1
-terminado==0.15.0
-tinycss2==1.1.1
-tornado==6.2
-traitlets==5.4.0
-Wand==0.6.10
-wcwidth==0.2.5
-webencodings==0.5.1
-widgetsnbextension==4.0.3
+pdfplumber
+numpy
+pandas
+click
+openpyxl