Compare commits: c56241fe4c ... main (33 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 6e0ffe9085 | |
| | ab2fdb0541 | |
| | 0fc39ed317 | |
| | a6d6681756 | |
| | 4eecb3a44c | |
| | 60da623323 | |
| | 1f1e3e2741 | |
| | 2b3e935f39 | |
| | ef63f22d44 | |
| | 1020ef9257 | |
| | 39084ceebd | |
| | 7de6c8dd9c | |
| | da3815eea6 | |
| | 45d343d810 | |
| | 806227f202 | |
| | 7bf0c38883 | |
| | b15b059e2a | |
| | 48e75358ac | |
| | 132e37267b | |
| | f2bcf6241a | |
| | ec9cc19be5 | |
| | 0040dccd9a | |
| | b0333cddd8 | |
| | 406b89fea1 | |
| | 812d392720 | |
| | 6b77980e6c | |
| | 90c2d3689b | |
| | f9be31c090 | |
| | 2761c3ed7b | |
| | 5692898137 | |
| | 44d4150910 | |
| | 223f25130d | |
| | 1a86b7bc26 | |
.drone.yml (18 changed lines)

```diff
@@ -12,7 +12,7 @@ steps:
     image: python:3.11
     commands:
       - echo ${DRONE_TAG}
-      - sed -i "s/VERSION_PLACEHOLDER/${DRONE_TAG}/g" pyproject.toml
+      - sed -i 's/version = "[^"]*"/version = "${DRONE_TAG}"/g' pyproject.toml
       - curl -sSL https://install.python-poetry.org | python3 -
       - export PATH="/root/.local/bin:$PATH"
       - poetry --version
@@ -22,10 +22,18 @@ steps:
       PYPI_TOKEN:
         from_secret: pypi_token
     when:
       event:
         include:
          - tag
+  - name: Notify on matrix
+    image: plugins/matrix
+    environment:
+      MATRIX_ROOMID:
+        from_secret: MATRIX_ROOMID
+      MATRIX_ACCESSTOKEN:
+        from_secret: MATRIX_ACCESSTOKEN
+      MATRIX_USERID:
+        from_secret: MATRIX_USERID
+    settings:
+      homeserver: https://matrix.poneyworld.net
+      template: "Une nouvelle version (${DRONE_TAG}) de pdf-oralia est publiée!"

 # Déclencheur de la pipeline
 trigger:
```
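Where the old step substituted a literal VERSION_PLACEHOLDER token, the new sed expression rewrites whatever value currently sits in the `version = "..."` line, so it also works against the checked-in `version = "0"`. A minimal Python sketch of the same substitution (the tag value is made up):

```python
import re

# Hypothetical tag, provided by CI as DRONE_TAG in the pipeline.
tag = "1.2.3"

pyproject = 'name = "pdf-oralia"\nversion = "0"\n'

# Same effect as the new sed command: replace whatever is between
# the quotes of `version = "..."` with the tag.
updated = re.sub(r'version = "[^"]*"', f'version = "{tag}"', pyproject)
print(updated)  # version = "1.2.3"
```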
pdf_oralia/__init__.py (1 line added)

```diff
@@ -0,0 +1 @@
+from .extract import from_pdf
```
pdf_oralia/extract.py

```diff
@@ -38,14 +38,15 @@ def catch_malformed_table(tables):
     return tables[0]


-def from_pdf(pdf):
+def from_pdf(pdf_file):
     """Build dataframes one about charges and another on loc"""
+    pdf = pdfplumber.open(pdf_file)
     recapitulatif_tables = []
     loc_tables = []
     charge_tables = []
     patrimoie_tables = []

-    for page in pdf.pages:
+    for page_number, page in enumerate(pdf.pages):
         page_text = page.extract_text()
         date = extract_date(page_text)
         additionnal_fields = {
@@ -76,7 +77,7 @@ def from_pdf(pdf):
             pass

         else:
-            raise ValueError("Page non reconnu")
+            logging.warning(f"Page {page_number+1} non reconnu. Page ignorée.")

     df_charge = charge.table2df(recapitulatif_tables + charge_tables)
     df_loc = locataire.table2df(loc_tables)
@@ -90,8 +91,7 @@ def extract_save(pdf_file, dest):
     xls_charge = Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_charge.xlsx"
     xls_locataire = Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_locataire.xlsx"

-    pdf = pdfplumber.open(pdf_file)
-    df_charge, df_loc = from_pdf(pdf)
+    df_charge, df_loc = from_pdf(pdf_file)

     df_charge.to_excel(xls_charge, sheet_name="Charges", index=False)
     logging.info(f"{xls_charge} saved")
```
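With this change `from_pdf` owns the pdfplumber handle, so callers pass a file path directly, and the enumerated page number lets an unrecognised page be logged and skipped instead of aborting the whole extraction. A minimal sketch of the new page loop ("example.pdf" is a placeholder path):

```python
import pdfplumber

# Open the PDF from a path, as from_pdf now does internally, and
# track page numbers so a bad page can be reported and skipped.
with pdfplumber.open("example.pdf") as pdf:
    for page_number, page in enumerate(pdf.pages):
        text = page.extract_text()
        print(page_number + 1, len(text or ""))
```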
pdf_oralia/join.py (new file, 29 lines)

```diff
@@ -0,0 +1,29 @@
+import glob
+import logging
+
+import pandas as pd
+
+
+def join_excel(src, dest, file_pattern):
+    """Join every excel file in src matching file_pattern into one unique file in dest"""
+    filenames = list_files(src, file_pattern)
+    logging.debug(f"Concatenate {filenames}")
+    dfs = extract_dfs(filenames)
+    joined_df = pd.concat(dfs)
+    logging.debug(f"Writing joined excel to {dest}")
+    joined_df.to_excel(dest, index=False)
+    logging.debug(f"with {len(joined_df)} rows")
+
+
+def list_files(src, file_glob):
+    return list(glob.iglob(f"{src}/{file_glob}"))
+
+
+def extract_dfs(filenames):
+    dfs = []
+    for filename in filenames:
+        logging.debug(f"Extracting {filename}")
+        df = pd.read_excel(filename)
+        logging.debug(f"Found {len(df)} rows")
+        dfs.append(df)
+    return dfs
```
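A short usage sketch of the new module (the paths are hypothetical); with debug logging enabled it reports each file it reads and the final row count:

```python
import logging

from pdf_oralia.join import join_excel

logging.basicConfig(level=logging.DEBUG)

# Concatenate every *_charge.xlsx found under ./xlsx into one file.
join_excel("./xlsx", "./charge.xlsx", "*_charge.xlsx")
```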
charge.py

```diff
@@ -3,7 +3,21 @@ import re
 import numpy as np
 import pandas as pd

-RECAPITULATIF_DES_OPERATION = 1
+RECAPITULATIF_DES_OPERATIONS = 1
+DF_TYPES = {
+    "Fournisseur": str,
+    "RECAPITULATIF DES OPERATIONS": str,
+    "Débits": float,
+    "Crédits": float,
+    "Dont T.V.A.": float,
+    "Locatif": float,
+    "Déductible": float,
+    "immeuble": str,
+    "mois": str,
+    "annee": str,
+    "lot": str,
+}
+DEFAULT_FOURNISSEUR = "ROSIER MODICA MOTTEROZ SA"


 def is_it(page_text):
@@ -17,8 +31,11 @@ def is_it(page_text):

 def get_lot(txt):
     """Return lot number from "RECAPITULATIF DES OPERATIONS" """
-    regex = r"[BSM](\d+)\s-"
-    result = re.findall(regex, txt)
+    regex = r"[BSM](\d+)(?=\s*-)"
+    try:
+        result = re.findall(regex, txt)
+    except TypeError:
+        return "*"
     if result:
         return "{:02d}".format(int(result[0]))
     return "*"
@@ -27,14 +44,14 @@ def get_lot(txt):
 def keep_row(row):
     return not any(
         [
-            word.lower() in row[RECAPITULATIF_DES_OPERATION].lower()
+            word.lower() in row[RECAPITULATIF_DES_OPERATIONS].lower()
             for word in ["TOTAL", "TOTAUX", "Solde créditeur", "Solde débiteur"]
         ]
     )


 def extract(table, additionnal_fields: dict = {}):
-    """Turn table to dictionary with additionnal fields"""
+    """Turn table to dictionary with additional fields"""
     extracted = []
     header = table[0]
     for row in table[1:]:
@@ -49,10 +66,8 @@ def extract(table, additionnal_fields: dict = {}):
         for k, v in additionnal_fields.items():
             r[k] = v

-        r["lot"] = get_lot(row[RECAPITULATIF_DES_OPERATION])
-
-        if "honoraire" in row[RECAPITULATIF_DES_OPERATION]:
-            r["Fournisseur"] = "IMI GERANCE"
+        if "honoraire" in row[RECAPITULATIF_DES_OPERATIONS].lower():
+            r["Fournisseur"] = DEFAULT_FOURNISSEUR

         extracted.append(r)
@@ -69,4 +84,8 @@ def table2df(tables):
     )
     df["Fournisseur"] = df["Fournisseur"].fillna(method="ffill")
     dfs.append(df)
-    return pd.concat(dfs)
+    df = pd.concat(dfs)
+
+    df["immeuble"] = df["immeuble"].apply(lambda x: x[0].capitalize())
+    df["lot"] = df["RECAPITULATIF DES OPERATIONS"].apply(get_lot)
+    return df.astype(DF_TYPES)
```
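The `get_lot` change replaces `\s-` with the lookahead `(?=\s*-)`, so a lot code like `B12-` or `B12  -` (no whitespace, or extra whitespace, before the dash) still yields its number, and the new try/except returns the `*` fallback when `txt` is not a string. A quick comparison of the two patterns on invented samples:

```python
import re

OLD = r"[BSM](\d+)\s-"
NEW = r"[BSM](\d+)(?=\s*-)"

samples = ["B12 - GARAGE", "B12- GARAGE", "B12  - GARAGE"]  # made-up strings
for s in samples:
    print(s, re.findall(OLD, s), re.findall(NEW, s))
# The old pattern misses "B12-" (no space) and "B12  -" (two spaces);
# the lookahead version extracts "12" from all three.
```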
locataire.py

```diff
@@ -1,5 +1,23 @@
 import numpy as np
 import pandas as pd

+DF_TYPES = {
+    "Locataires": str,
+    "Période": str,
+    "Loyers": float,
+    "Taxes": float,
+    "Provisions": float,
+    "Divers": str,
+    "Total": float,
+    "Réglés": float,
+    "Impayés": float,
+    "immeuble": str,
+    "mois": str,
+    "annee": str,
+    "Lot": str,
+    "Type": str,
+}
+

 def is_it(page_text):
     if "SITUATION DES LOCATAIRES" in page_text:
@@ -16,7 +34,7 @@ def is_drop(row):


 def extract(table, additionnal_fields: dict = {}):
-    """Turn table to dictionary with additionnal fields"""
+    """Turn table to dictionary with additional fields"""
     extracted = []
     header = table[0]
     for row in table[1:]:
@@ -67,6 +85,12 @@ def parse_lot(string):
     return {"Lot": "{:02d}".format(int(words[1])), "Type": " ".join(words[2:])}


+def clean_type(string):
+    if "appartement" in string.lower():
+        return string[-2:]
+    return string
+
+
 def join_row(table):
     joined = []
     for row in table:
@@ -115,8 +139,6 @@ def join_row(table):
                 }
             )
             joined.append(row)
-        else:
-            print(row)

     return joined
@@ -131,4 +153,14 @@ def flat_tables(tables):
 def table2df(tables):
     tables = flat_tables(tables)
     joined = join_row(tables)
-    return pd.DataFrame.from_records(joined)
+    df = pd.DataFrame.from_records(joined)
+
+    df["immeuble"] = df["immeuble"].apply(lambda x: x[0].capitalize())
+    df["Type"] = df["Type"].apply(clean_type)
+
+    numeric_cols = [k for k, v in DF_TYPES.items() if v == float]
+    df[numeric_cols] = df[numeric_cols].replace("", np.nan)
+
+    df = df.drop(df[(df["Locataires"] == "") & (df["Période"] == "")].index)
+
+    return df.astype(DF_TYPES)
```
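The reworked `table2df` has to blank out empty strings before `astype(DF_TYPES)`, because casting `""` to float raises a ValueError. A toy illustration of that step (invented rows, not real PDF data):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"Loyers": ["610.00", ""], "Locataires": ["X", ""]})

# Empty strings in numeric columns become NaN first...
numeric_cols = ["Loyers"]
df[numeric_cols] = df[numeric_cols].replace("", np.nan)

# ...so the cast succeeds; astype(float) on "" would have raised.
df = df.astype({"Loyers": float, "Locataires": str})
print(df.dtypes)
```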
pdf_oralia/scripts.py

```diff
@@ -3,11 +3,18 @@ from logging.config import dictConfig
 from pathlib import Path

 import click
-import pandas as pd

 from .extract import extract_save
+from .join import join_excel


+@click.group()
+@click.option("--debug/--no-debug", default=False)
+def main(debug):
+    if debug:
+        logging_level = logging.DEBUG
+    else:
+        logging_level = logging.INFO
 logging_config = dict(
     version=1,
     formatters={"f": {"format": "%(levelname)-8s %(name)-12s %(message)s"}},
@@ -15,23 +22,18 @@ logging_config = dict(
         "h": {
             "class": "logging.StreamHandler",
             "formatter": "f",
-            "level": logging.DEBUG,
+            "level": logging_level,
         }
     },
     root={
         "handlers": ["h"],
-        "level": logging.DEBUG,
+        "level": logging_level,
     },
 )

 dictConfig(logging_config)


-@click.group()
-def main():
-    pass
-
-
 @main.group()
 def extract():
     pass
@@ -66,5 +68,31 @@ def all(src, dest):
 @main.command()
 @click.option("--src", help="Tous les fichiers dans src", default="./")
 @click.option("--dest", help="Où mettre les fichiers produits", default="")
-def join(src, dest):
-    join_excel(src, dest, df_names=["charge", "locataire"])
+@click.option(
+    "--force",
+    help="Ecraser si le fichier destination existe.",
+    default=False,
+    is_flag=True,
+)
+def join(src, dest, force):
+    """Join tous les fichiers excel charge (resp locataire) de src dans un seul fichier charge.xlsx dans dest.
+
+    Exemple:
+
+        pdf-oralia join --src <dossier_source> --dest <dossier_destination>
+
+    """
+    dest_charge = f"{dest}/charge.xlsx"
+    if not force and Path(dest_charge).exists():
+        raise ValueError(f"The file {dest_charge} already exists")
+    dest_locataire = f"{dest}/locataire.xlsx"
+    if not force and Path(dest_locataire).exists():
+        raise ValueError(f"The file {dest_locataire} already exists")
+
+    if not Path(src).exists():
+        raise ValueError(f"The source directory ({src}) does not exist.")
+    join_excel(src, dest_charge, "*_charge.xlsx")
+    logging.info(f"Les données charges ont été concaténées dans {dest_charge}")
+    join_excel(src, dest_locataire, "*_locataire.xlsx")
+    logging.info(f"Les données locataires ont été concaténées dans {dest_locataire}")
```
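After this refactor, logging verbosity is decided once at the group level, so `--debug` must come before the subcommand name. A sketch of driving the CLI through click's test runner (the directories are hypothetical):

```python
from click.testing import CliRunner

from pdf_oralia.scripts import main

runner = CliRunner()
# The --debug flag belongs to the group, so it precedes the subcommand;
# "./xlsx" and "./out" are placeholder directories.
result = runner.invoke(main, ["--debug", "join", "--src", "./xlsx", "--dest", "./out"])
print(result.exit_code, result.output)
```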
poetry.lock (generated, 2927 changed lines): diff suppressed because it is too large.
pyproject.toml

```diff
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pdf-oralia"
-version = "VERSION_PLACEHOLDER"
+version = "0"
 description = ""
 authors = ["Bertrand Benjamin <benjamin.bertrand@opytex.org>"]
 readme = "README.md"
@@ -13,7 +13,7 @@ pdf-oralia = "pdf_oralia.scripts:main"
 python = "^3.10"
 click = "^8.1.3"
 pdfplumber = "^0.7.4"
-pandas = "^1.5.0"
+pandas = "^2.2.3"
 openpyxl = "^3.0.10"
```
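The jump from pandas ^1.5.0 to ^2.2.3 crosses a major version; one spot in this range that pandas 2.x flags is the `fillna(method="ffill")` call kept in charge.py, which is deprecated in favour of `.ffill()`. A minimal illustration (assuming pandas >= 2.1):

```python
import pandas as pd

s = pd.Series(["ROSIER MODICA MOTTEROZ SA", None, None])

# Deprecated spelling, emits FutureWarning on pandas >= 2.1:
#   s.fillna(method="ffill")

# Equivalent, non-deprecated spelling:
print(s.ffill())  # forward-fills the two missing values
```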
renovate.json (new file, 2 lines)

```diff
@@ -0,0 +1,2 @@
+{
+}
```
requirements.txt

```diff
@@ -1,76 +1,3 @@
-argon2-cffi==21.3.0
-argon2-cffi-bindings==21.2.0
-asttokens==2.0.8
-attrs==22.1.0
-backcall==0.2.0
-beautifulsoup4==4.11.1
-bleach==5.0.1
-cffi==1.15.1
-charset-normalizer==2.1.1
-cryptography==38.0.1
-debugpy==1.6.3
-decorator==5.1.1
-defusedxml==0.7.1
-entrypoints==0.4
-et-xmlfile==1.1.0
-executing==1.1.0
-fastjsonschema==2.16.2
-ipykernel==6.16.0
-ipython==8.5.0
-ipython-genutils==0.2.0
-ipywidgets==8.0.2
-jedi==0.18.1
-Jinja2==3.1.2
-jsonschema==4.16.0
-jupyter==1.0.0
-jupyter-console==6.4.4
-jupyter-core==4.11.1
-jupyter_client==7.3.5
-jupyterlab-pygments==0.2.2
-jupyterlab-widgets==3.0.3
-lxml==4.9.1
-MarkupSafe==2.1.1
-matplotlib-inline==0.1.6
-mistune==2.0.4
-nbclient==0.6.8
-nbconvert==7.0.0
-nbformat==5.6.1
-nest-asyncio==1.5.5
-notebook==6.4.12
-numpy==1.23.3
-openpyxl==3.0.10
-packaging==21.3
-pandas==1.5.0
-pandocfilters==1.5.0
-parso==0.8.3
-pdfminer.six==20220524
-pdfplumber==0.7.4
-pexpect==4.8.0
-pickleshare==0.7.5
-Pillow==9.2.0
-prometheus-client==0.14.1
-prompt-toolkit==3.0.31
-psutil==5.9.2
-ptyprocess==0.7.0
-pure-eval==0.2.2
-pycparser==2.21
-Pygments==2.13.0
-pyparsing==3.0.9
-pyrsistent==0.18.1
-python-dateutil==2.8.2
-pytz==2022.2.1
-pyzmq==24.0.1
-qtconsole==5.3.2
-QtPy==2.2.0
-Send2Trash==1.8.0
-six==1.16.0
-soupsieve==2.3.2.post1
-stack-data==0.5.1
-terminado==0.15.0
-tinycss2==1.1.1
-tornado==6.2
-traitlets==5.4.0
-Wand==0.6.10
-wcwidth==0.2.5
-webencodings==0.5.1
-widgetsnbextension==4.0.3
+pdfplumber
+numpy
+pandas
```