15 Commits

7 changed files with 130 additions and 24 deletions

View File

@@ -12,7 +12,7 @@ steps:
image: python:3.11
commands:
- echo ${DRONE_TAG}
- sed -i "s/VERSION_PLACEHOLDER/${DRONE_TAG}/g" pyproject.toml
- sed -i 's/version = "[^"]*"/version = "${DRONE_TAG}"/g' pyproject.toml
- curl -sSL https://install.python-poetry.org | python3 -
- export PATH="/root/.local/bin:$PATH"
- poetry --version
@@ -22,10 +22,18 @@ steps:
PYPI_TOKEN:
from_secret: pypi_token
when:
event:
include:
- tag
- name: Notify on matrix
image: plugins/matrix
environment:
MATRIX_ROOMID:
from_secret: MATRIX_ROOMID
MATRIX_ACCESSTOKEN:
from_secret: MATRIX_ACCESSTOKEN
MATRIX_USERID:
from_secret: MATRIX_USERID
settings:
homeserver: https://matrix.poneyworld.net
template: "Une nouvelle version (${DRONE_TAG}) de pdf-oralia est publiée!"
# Déclencheur de la pipeline
trigger:

View File

@@ -45,7 +45,7 @@ def from_pdf(pdf):
charge_tables = []
patrimoie_tables = []
for page in pdf.pages:
for page_number, page in enumerate(pdf.pages):
page_text = page.extract_text()
date = extract_date(page_text)
additionnal_fields = {
@@ -76,7 +76,7 @@ def from_pdf(pdf):
pass
else:
raise ValueError("Page non reconnu")
logging.warning(f"Page {page_number+1} non reconnu. Page ignorée.")
df_charge = charge.table2df(recapitulatif_tables + charge_tables)
df_loc = locataire.table2df(loc_tables)

22
pdf_oralia/join.py Normal file
View File

@@ -0,0 +1,22 @@
import glob
import pandas as pd
def join_excel(src, dest, file_pattern):
    """Join every Excel file in ``src`` matching ``file_pattern`` into one file.

    Args:
        src: directory searched for input workbooks.
        dest: path of the workbook to write (passed to ``DataFrame.to_excel``).
        file_pattern: glob pattern, relative to ``src``, selecting the inputs.

    Raises:
        ValueError: if no file in ``src`` matches ``file_pattern``.
    """
    filenames = list_files(src, file_pattern)
    if not filenames:
        # pd.concat would raise an opaque "No objects to concatenate";
        # fail early with a message that names what was searched.
        raise ValueError(f"No file matching {file_pattern!r} found in {src!r}")
    dfs = extract_dfs(filenames)
    joined_df = pd.concat(dfs)
    # index=False: the concatenated index is not written to the sheet.
    joined_df.to_excel(dest, index=False)
def list_files(src, file_glob):
    """Return the paths in ``src`` matching ``file_glob``, sorted by name.

    glob yields entries in arbitrary filesystem order; sorting makes the
    result (and anything built from it) reproducible across runs and machines.

    Args:
        src: directory to search.
        file_glob: glob pattern appended to ``src``.

    Returns:
        A sorted list of matching path strings (empty when nothing matches).
    """
    return sorted(glob.glob(f"{src}/{file_glob}"))
def extract_dfs(filenames):
    """Read each Excel file in ``filenames`` and return the DataFrames in order."""
    return [pd.read_excel(path) for path in filenames]

View File

@@ -3,7 +3,21 @@ import re
import numpy as np
import pandas as pd
RECAPITULATIF_DES_OPERATION = 1
RECAPITULATIF_DES_OPERATIONS = 1
DF_TYPES = {
"Fournisseur": str,
"RECAPITULATIF DES OPERATIONS": str,
"Débits": float,
"Crédits": float,
"Dont T.V.A.": float,
"Locatif": float,
"Déductible": float,
"immeuble": str,
"mois": str,
"annee": str,
"lot": str,
}
DEFAULT_FOURNISSEUR = "ROSIER MODICA MOTTEROZ SA"
def is_it(page_text):
@@ -17,8 +31,11 @@ def is_it(page_text):
def get_lot(txt):
    """Return the lot number found in a "RECAPITULATIF DES OPERATIONS" value.

    The lot is the run of digits following a ``B``, ``S`` or ``M`` and
    preceding a dash (optional whitespace allowed before the dash).

    Args:
        txt: the text to search.

    Returns:
        The first lot number, zero-padded to two digits, or ``"*"`` when no
        lot is found or ``txt`` is not something ``re.findall`` can search.
    """
    regex = r"[BSM](\d+)(?=\s*-)"
    try:
        result = re.findall(regex, txt)
    except TypeError:
        # re.findall rejects non-string input (presumably empty/NaN cells —
        # confirm against callers); treat it as "no lot".
        return "*"
    if result:
        return "{:02d}".format(int(result[0]))
    return "*"
@@ -27,14 +44,14 @@ def get_lot(txt):
def keep_row(row):
    """Tell whether ``row`` should be kept, i.e. is not a total/balance line.

    Args:
        row: a table row indexable by ``RECAPITULATIF_DES_OPERATIONS``; that
            cell must be a string.

    Returns:
        False when the cell contains, case-insensitively, one of the
        total/balance markers; True otherwise.
    """
    # Lower-case the cell once instead of once per marker.
    label = row[RECAPITULATIF_DES_OPERATIONS].lower()
    return not any(
        word.lower() in label
        for word in ["TOTAL", "TOTAUX", "Solde créditeur", "Solde débiteur"]
    )
def extract(table, additionnal_fields: dict = {}):
"""Turn table to dictionary with additionnal fields"""
"""Turn table to dictionary with additional fields"""
extracted = []
header = table[0]
for row in table[1:]:
@@ -49,10 +66,8 @@ def extract(table, additionnal_fields: dict = {}):
for k, v in additionnal_fields.items():
r[k] = v
r["lot"] = get_lot(row[RECAPITULATIF_DES_OPERATION])
if "honoraire" in row[RECAPITULATIF_DES_OPERATION]:
r["Fournisseur"] = "IMI GERANCE"
if "honoraire" in row[RECAPITULATIF_DES_OPERATIONS].lower():
r["Fournisseur"] = DEFAULT_FOURNISSEUR
extracted.append(r)
@@ -69,4 +84,8 @@ def table2df(tables):
)
df["Fournisseur"] = df["Fournisseur"].fillna(method="ffill")
dfs.append(df)
return pd.concat(dfs)
df = pd.concat(dfs)
df["immeuble"] = df["immeuble"].apply(lambda x: x[0].capitalize())
df["lot"] = df["RECAPITULATIF DES OPERATIONS"].apply(get_lot)
return df.astype(DF_TYPES)

View File

@@ -1,5 +1,23 @@
import numpy as np
import pandas as pd
DF_TYPES = {
"Locataires": str,
"Période": str,
"Loyers": float,
"Taxes": float,
"Provisions": float,
"Divers": str,
"Total": float,
"Réglés": float,
"Impayés": float,
"immeuble": str,
"mois": str,
"annee": str,
"Lot": str,
"Type": str,
}
def is_it(page_text):
if "SITUATION DES LOCATAIRES" in page_text:
@@ -16,7 +34,7 @@ def is_drop(row):
def extract(table, additionnal_fields: dict = {}):
"""Turn table to dictionary with additionnal fields"""
"""Turn table to dictionary with additional fields"""
extracted = []
header = table[0]
for row in table[1:]:
@@ -67,6 +85,12 @@ def parse_lot(string):
return {"Lot": "{:02d}".format(int(words[1])), "Type": " ".join(words[2:])}
def clean_type(string):
    """Shorten a type label mentioning "appartement" to its last two characters.

    The match is case-insensitive; any other label is returned unchanged.
    """
    mentions_flat = "appartement" in string.lower()
    return string[-2:] if mentions_flat else string
def join_row(table):
joined = []
for row in table:
@@ -115,8 +139,6 @@ def join_row(table):
}
)
joined.append(row)
else:
print(row)
return joined
@@ -131,4 +153,14 @@ def flat_tables(tables):
def table2df(tables):
tables = flat_tables(tables)
joined = join_row(tables)
return pd.DataFrame.from_records(joined)
df = pd.DataFrame.from_records(joined)
df["immeuble"] = df["immeuble"].apply(lambda x: x[0].capitalize())
df["Type"] = df["Type"].apply(clean_type)
numeric_cols = [k for k, v in DF_TYPES.items() if v == float]
df[numeric_cols] = df[numeric_cols].replace("", np.nan)
df = df.drop(df[(df["Locataires"] == "") & (df["Période"] == "")].index)
return df.astype(DF_TYPES)

View File

@@ -5,6 +5,7 @@ from pathlib import Path
import click
from .extract import extract_save
from .join import join_excel
logging_config = dict(
version=1,
@@ -64,5 +65,29 @@ def all(src, dest):
@main.command()
@click.option("--src", help="Tous les fichiers dans src", default="./")
@click.option("--dest", help="Où mettre les fichiers produits", default="")
@click.option(
    "--force",
    help="Ecraser si le ficher destination existe.",
    default=False,
    is_flag=True,
)
def join(src, dest, force):
    """Join tous les fichiers excel charge (resp locataire) de src dans un seul fichier charge.xlsx dans dist.

    Exemple:

        pdf-oralia join --src <dossier_source> --dest <dossier_destination>

    """
    # NOTE: the docstring above is shown by click as the command help, so it
    # is user-facing text and is kept as written.

    # Build the targets with pathlib: with the default dest "" the former
    # f"{dest}/charge.xlsx" resolved to "/charge.xlsx" (filesystem root)
    # instead of the current directory.
    dest_dir = Path(dest)
    dest_charge = dest_dir / "charge.xlsx"
    dest_locataire = dest_dir / "locataire.xlsx"

    # Check both targets before writing anything, so a refusal leaves no
    # partial output behind.
    if not force:
        for target in (dest_charge, dest_locataire):
            if target.exists():
                raise ValueError(f"The file {target} already exists")

    join_excel(src, dest_charge, "*_charge.xlsx")
    logging.info(f"Les données charges ont été concaténées dans {dest_charge}")
    join_excel(src, dest_locataire, "*_locataire.xlsx")
    logging.info(f"Les données locataires ont été concaténées dans {dest_locataire}")

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdf-oralia"
version = "VERSION_PLACEHOLDER"
version = "0"
description = ""
authors = ["Bertrand Benjamin <benjamin.bertrand@opytex.org>"]
readme = "README.md"