Compare commits: c56241fe4c ... main (33 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 6e0ffe9085 | |
| | ab2fdb0541 | |
| | 0fc39ed317 | |
| | a6d6681756 | |
| | 4eecb3a44c | |
| | 60da623323 | |
| | 1f1e3e2741 | |
| | 2b3e935f39 | |
| | ef63f22d44 | |
| | 1020ef9257 | |
| | 39084ceebd | |
| | 7de6c8dd9c | |
| | da3815eea6 | |
| | 45d343d810 | |
| | 806227f202 | |
| | 7bf0c38883 | |
| | b15b059e2a | |
| | 48e75358ac | |
| | 132e37267b | |
| | f2bcf6241a | |
| | ec9cc19be5 | |
| | 0040dccd9a | |
| | b0333cddd8 | |
| | 406b89fea1 | |
| | 812d392720 | |
| | 6b77980e6c | |
| | 90c2d3689b | |
| | f9be31c090 | |
| | 2761c3ed7b | |
| | 5692898137 | |
| | 44d4150910 | |
| | 223f25130d | |
| | 1a86b7bc26 | |
.drone.yml (18 changed lines)

```diff
@@ -12,7 +12,7 @@ steps:
     image: python:3.11
     commands:
       - echo ${DRONE_TAG}
-      - sed -i "s/VERSION_PLACEHOLDER/${DRONE_TAG}/g" pyproject.toml
+      - sed -i 's/version = "[^"]*"/version = "${DRONE_TAG}"/g' pyproject.toml
       - curl -sSL https://install.python-poetry.org | python3 -
       - export PATH="/root/.local/bin:$PATH"
       - poetry --version
@@ -22,10 +22,18 @@ steps:
       PYPI_TOKEN:
         from_secret: pypi_token
     when:
       event:
         include:
          - tag
+  - name: Notify on matrix
+    image: plugins/matrix
+    environment:
+      MATRIX_ROOMID:
+        from_secret: MATRIX_ROOMID
+      MATRIX_ACCESSTOKEN:
+        from_secret: MATRIX_ACCESSTOKEN
+      MATRIX_USERID:
+        from_secret: MATRIX_USERID
+    settings:
+      homeserver: https://matrix.poneyworld.net
+      template: "Une nouvelle version (${DRONE_TAG}) de pdf-oralia est publiée!"

 # Déclencheur de la pipeline
 trigger:
```
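Where the old step substituted a literal VERSION_PLACEHOLDER token, the new sed expression rewrites whatever value currently sits in the `version = "..."` line, so it also works against the checked-in `version = "0"`. A minimal Python sketch of the same substitution (the tag value is made up):

```python
import re

# Hypothetical tag, provided by CI as DRONE_TAG in the pipeline.
tag = "1.2.3"

pyproject = 'name = "pdf-oralia"\nversion = "0"\n'

# Same effect as the new sed command: replace whatever is between
# the quotes of `version = "..."` with the tag.
updated = re.sub(r'version = "[^"]*"', f'version = "{tag}"', pyproject)
print(updated)  # version = "1.2.3"
```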
pdf_oralia/__init__.py (1 line added)

```diff
@@ -0,0 +1 @@
+from .extract import from_pdf
```
pdf_oralia/extract.py

```diff
@@ -38,14 +38,15 @@ def catch_malformed_table(tables):
     return tables[0]


-def from_pdf(pdf):
+def from_pdf(pdf_file):
     """Build dataframes one about charges and another on loc"""
+    pdf = pdfplumber.open(pdf_file)
     recapitulatif_tables = []
     loc_tables = []
     charge_tables = []
     patrimoie_tables = []

-    for page in pdf.pages:
+    for page_number, page in enumerate(pdf.pages):
         page_text = page.extract_text()
         date = extract_date(page_text)
         additionnal_fields = {
@@ -76,7 +77,7 @@ def from_pdf(pdf):
             pass

         else:
-            raise ValueError("Page non reconnu")
+            logging.warning(f"Page {page_number+1} non reconnu. Page ignorée.")

     df_charge = charge.table2df(recapitulatif_tables + charge_tables)
     df_loc = locataire.table2df(loc_tables)
@@ -90,8 +91,7 @@ def extract_save(pdf_file, dest):
     xls_charge = Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_charge.xlsx"
     xls_locataire = Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_locataire.xlsx"

-    pdf = pdfplumber.open(pdf_file)
-    df_charge, df_loc = from_pdf(pdf)
+    df_charge, df_loc = from_pdf(pdf_file)

     df_charge.to_excel(xls_charge, sheet_name="Charges", index=False)
     logging.info(f"{xls_charge} saved")
```
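With this change `from_pdf` owns the pdfplumber handle, so callers pass a file path directly, and the enumerated page number lets an unrecognised page be logged and skipped instead of aborting the whole extraction. A minimal sketch of the new page loop ("example.pdf" is a placeholder path):

```python
import pdfplumber

# Open the PDF from a path, as from_pdf now does internally, and
# track page numbers so a bad page can be reported and skipped.
with pdfplumber.open("example.pdf") as pdf:
    for page_number, page in enumerate(pdf.pages):
        text = page.extract_text()
        print(page_number + 1, len(text or ""))
```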
pdf_oralia/join.py (new file, 29 lines)

```diff
@@ -0,0 +1,29 @@
+import glob
+import logging
+
+import pandas as pd
+
+
+def join_excel(src, dest, file_pattern):
+    """Join every excel file in src matching file_pattern into one unique file in dest"""
+    filenames = list_files(src, file_pattern)
+    logging.debug(f"Concatenate {filenames}")
+    dfs = extract_dfs(filenames)
+    joined_df = pd.concat(dfs)
+    logging.debug(f"Writing joined excel to {dest}")
+    joined_df.to_excel(dest, index=False)
+    logging.debug(f"with {len(joined_df)} rows")
+
+
+def list_files(src, file_glob):
+    return list(glob.iglob(f"{src}/{file_glob}"))
+
+
+def extract_dfs(filenames):
+    dfs = []
+    for filename in filenames:
+        logging.debug(f"Extracting {filename}")
+        df = pd.read_excel(filename)
+        logging.debug(f"Found {len(df)} rows")
+        dfs.append(df)
+    return dfs
```
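A short usage sketch of the new module (the paths are hypothetical); with debug logging enabled it reports each file it reads and the final row count:

```python
import logging

from pdf_oralia.join import join_excel

logging.basicConfig(level=logging.DEBUG)

# Concatenate every *_charge.xlsx found under ./xlsx into one file.
join_excel("./xlsx", "./charge.xlsx", "*_charge.xlsx")
```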
charge.py

```diff
@@ -3,7 +3,21 @@ import re
 import numpy as np
 import pandas as pd

-RECAPITULATIF_DES_OPERATION = 1
+RECAPITULATIF_DES_OPERATIONS = 1
+DF_TYPES = {
+    "Fournisseur": str,
+    "RECAPITULATIF DES OPERATIONS": str,
+    "Débits": float,
+    "Crédits": float,
+    "Dont T.V.A.": float,
+    "Locatif": float,
+    "Déductible": float,
+    "immeuble": str,
+    "mois": str,
+    "annee": str,
+    "lot": str,
+}
+DEFAULT_FOURNISSEUR = "ROSIER MODICA MOTTEROZ SA"


 def is_it(page_text):
@@ -17,8 +31,11 @@ def is_it(page_text):

 def get_lot(txt):
     """Return lot number from "RECAPITULATIF DES OPERATIONS" """
-    regex = r"[BSM](\d+)\s-"
-    result = re.findall(regex, txt)
+    regex = r"[BSM](\d+)(?=\s*-)"
+    try:
+        result = re.findall(regex, txt)
+    except TypeError:
+        return "*"
     if result:
         return "{:02d}".format(int(result[0]))
     return "*"
@@ -27,14 +44,14 @@ def get_lot(txt):
 def keep_row(row):
     return not any(
         [
-            word.lower() in row[RECAPITULATIF_DES_OPERATION].lower()
+            word.lower() in row[RECAPITULATIF_DES_OPERATIONS].lower()
             for word in ["TOTAL", "TOTAUX", "Solde créditeur", "Solde débiteur"]
         ]
     )


 def extract(table, additionnal_fields: dict = {}):
-    """Turn table to dictionary with additionnal fields"""
+    """Turn table to dictionary with additional fields"""
     extracted = []
     header = table[0]
     for row in table[1:]:
@@ -49,10 +66,8 @@ def extract(table, additionnal_fields: dict = {}):
         for k, v in additionnal_fields.items():
             r[k] = v

-        r["lot"] = get_lot(row[RECAPITULATIF_DES_OPERATION])
-
-        if "honoraire" in row[RECAPITULATIF_DES_OPERATION]:
-            r["Fournisseur"] = "IMI GERANCE"
+        if "honoraire" in row[RECAPITULATIF_DES_OPERATIONS].lower():
+            r["Fournisseur"] = DEFAULT_FOURNISSEUR

         extracted.append(r)
@@ -69,4 +84,8 @@ def table2df(tables):
     )
     df["Fournisseur"] = df["Fournisseur"].fillna(method="ffill")
     dfs.append(df)
-    return pd.concat(dfs)
+    df = pd.concat(dfs)
+
+    df["immeuble"] = df["immeuble"].apply(lambda x: x[0].capitalize())
+    df["lot"] = df["RECAPITULATIF DES OPERATIONS"].apply(get_lot)
+    return df.astype(DF_TYPES)
```
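The `get_lot` change replaces `\s-` with the lookahead `(?=\s*-)`, so a lot code like `B12-` or `B12  -` (no whitespace, or extra whitespace, before the dash) still yields its number, and the new try/except returns the `*` fallback when `txt` is not a string. A quick comparison of the two patterns on invented samples:

```python
import re

OLD = r"[BSM](\d+)\s-"
NEW = r"[BSM](\d+)(?=\s*-)"

samples = ["B12 - GARAGE", "B12- GARAGE", "B12  - GARAGE"]  # made-up strings
for s in samples:
    print(s, re.findall(OLD, s), re.findall(NEW, s))
# The old pattern misses "B12-" (no space) and "B12  -" (two spaces);
# the lookahead version extracts "12" from all three.
```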
locataire.py

```diff
@@ -1,5 +1,23 @@
 import numpy as np
 import pandas as pd

+DF_TYPES = {
+    "Locataires": str,
+    "Période": str,
+    "Loyers": float,
+    "Taxes": float,
+    "Provisions": float,
+    "Divers": str,
+    "Total": float,
+    "Réglés": float,
+    "Impayés": float,
+    "immeuble": str,
+    "mois": str,
+    "annee": str,
+    "Lot": str,
+    "Type": str,
+}
+

 def is_it(page_text):
     if "SITUATION DES LOCATAIRES" in page_text:
@@ -16,7 +34,7 @@ def is_drop(row):


 def extract(table, additionnal_fields: dict = {}):
-    """Turn table to dictionary with additionnal fields"""
+    """Turn table to dictionary with additional fields"""
     extracted = []
     header = table[0]
     for row in table[1:]:
@@ -67,6 +85,12 @@ def parse_lot(string):
     return {"Lot": "{:02d}".format(int(words[1])), "Type": " ".join(words[2:])}


+def clean_type(string):
+    if "appartement" in string.lower():
+        return string[-2:]
+    return string
+
+
 def join_row(table):
     joined = []
     for row in table:
@@ -115,8 +139,6 @@ def join_row(table):
                 }
             )
             joined.append(row)
-        else:
-            print(row)

     return joined
@@ -131,4 +153,14 @@ def flat_tables(tables):
 def table2df(tables):
     tables = flat_tables(tables)
     joined = join_row(tables)
-    return pd.DataFrame.from_records(joined)
+    df = pd.DataFrame.from_records(joined)
+
+    df["immeuble"] = df["immeuble"].apply(lambda x: x[0].capitalize())
+    df["Type"] = df["Type"].apply(clean_type)
+
+    numeric_cols = [k for k, v in DF_TYPES.items() if v == float]
+    df[numeric_cols] = df[numeric_cols].replace("", np.nan)
+
+    df = df.drop(df[(df["Locataires"] == "") & (df["Période"] == "")].index)
+
+    return df.astype(DF_TYPES)
```
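The reworked `table2df` has to blank out empty strings before `astype(DF_TYPES)`, because casting `""` to float raises a ValueError. A toy illustration of that step (invented rows, not real PDF data):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"Loyers": ["610.00", ""], "Locataires": ["X", ""]})

# Empty strings in numeric columns become NaN first...
numeric_cols = ["Loyers"]
df[numeric_cols] = df[numeric_cols].replace("", np.nan)

# ...so the cast succeeds; astype(float) on "" would have raised.
df = df.astype({"Loyers": float, "Locataires": str})
print(df.dtypes)
```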
pdf_oralia/scripts.py

```diff
@@ -3,11 +3,18 @@ from logging.config import dictConfig
 from pathlib import Path

 import click
-import pandas as pd

 from .extract import extract_save
+from .join import join_excel


+@click.group()
+@click.option("--debug/--no-debug", default=False)
+def main(debug):
+    if debug:
+        logging_level = logging.DEBUG
+    else:
+        logging_level = logging.INFO
 logging_config = dict(
     version=1,
     formatters={"f": {"format": "%(levelname)-8s %(name)-12s %(message)s"}},
@@ -15,23 +22,18 @@ logging_config = dict(
         "h": {
             "class": "logging.StreamHandler",
             "formatter": "f",
-            "level": logging.DEBUG,
+            "level": logging_level,
         }
     },
     root={
         "handlers": ["h"],
-        "level": logging.DEBUG,
+        "level": logging_level,
     },
 )

 dictConfig(logging_config)


-@click.group()
-def main():
-    pass
-
-
 @main.group()
 def extract():
     pass
@@ -66,5 +68,31 @@ def all(src, dest):
 @main.command()
 @click.option("--src", help="Tous les fichiers dans src", default="./")
 @click.option("--dest", help="Où mettre les fichiers produits", default="")
-def join(src, dest):
-    join_excel(src, dest, df_names=["charge", "locataire"])
+@click.option(
+    "--force",
+    help="Ecraser si le fichier destination existe.",
+    default=False,
+    is_flag=True,
+)
+def join(src, dest, force):
+    """Join tous les fichiers excel charge (resp locataire) de src dans un seul fichier charge.xlsx dans dest.
+
+    Exemple:
+
+        pdf-oralia join --src <dossier_source> --dest <dossier_destination>
+
+    """
+    dest_charge = f"{dest}/charge.xlsx"
+    if not force and Path(dest_charge).exists():
+        raise ValueError(f"The file {dest_charge} already exists")
+    dest_locataire = f"{dest}/locataire.xlsx"
+    if not force and Path(dest_locataire).exists():
+        raise ValueError(f"The file {dest_locataire} already exists")
+
+    if not Path(src).exists():
+        raise ValueError(f"The source directory ({src}) does not exist.")
+    join_excel(src, dest_charge, "*_charge.xlsx")
+    logging.info(f"Les données charges ont été concaténées dans {dest_charge}")
+    join_excel(src, dest_locataire, "*_locataire.xlsx")
+    logging.info(f"Les données locataires ont été concaténées dans {dest_locataire}")
```
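After this refactor, logging verbosity is decided once at the group level, so `--debug` must come before the subcommand name. A sketch of driving the CLI through click's test runner (the directories are hypothetical):

```python
from click.testing import CliRunner

from pdf_oralia.scripts import main

runner = CliRunner()
# The --debug flag belongs to the group, so it precedes the subcommand;
# "./xlsx" and "./out" are placeholder directories.
result = runner.invoke(main, ["--debug", "join", "--src", "./xlsx", "--dest", "./out"])
print(result.exit_code, result.output)
```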
poetry.lock (generated, 2927 changed lines): diff suppressed because it is too large.
pyproject.toml

```diff
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pdf-oralia"
-version = "VERSION_PLACEHOLDER"
+version = "0"
 description = ""
 authors = ["Bertrand Benjamin <benjamin.bertrand@opytex.org>"]
 readme = "README.md"
@@ -13,7 +13,7 @@ pdf-oralia = "pdf_oralia.scripts:main"
 python = "^3.10"
 click = "^8.1.3"
 pdfplumber = "^0.7.4"
-pandas = "^1.5.0"
+pandas = "^2.2.3"
 openpyxl = "^3.0.10"
```
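The jump from pandas ^1.5.0 to ^2.2.3 crosses a major version; one spot in this range that pandas 2.x flags is the `fillna(method="ffill")` call kept in charge.py, which is deprecated in favour of `.ffill()`. A minimal illustration (assuming pandas >= 2.1):

```python
import pandas as pd

s = pd.Series(["ROSIER MODICA MOTTEROZ SA", None, None])

# Deprecated spelling, emits FutureWarning on pandas >= 2.1:
#   s.fillna(method="ffill")

# Equivalent, non-deprecated spelling:
print(s.ffill())  # forward-fills the two missing values
```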
renovate.json (new file, 2 lines)

```diff
@@ -0,0 +1,2 @@
+{
+}
```
requirements.txt

```diff
@@ -1,76 +1,3 @@
-argon2-cffi==21.3.0
-argon2-cffi-bindings==21.2.0
-asttokens==2.0.8
-attrs==22.1.0
-backcall==0.2.0
-beautifulsoup4==4.11.1
-bleach==5.0.1
-cffi==1.15.1
-charset-normalizer==2.1.1
-cryptography==38.0.1
-debugpy==1.6.3
-decorator==5.1.1
-defusedxml==0.7.1
-entrypoints==0.4
-et-xmlfile==1.1.0
-executing==1.1.0
-fastjsonschema==2.16.2
-ipykernel==6.16.0
-ipython==8.5.0
-ipython-genutils==0.2.0
-ipywidgets==8.0.2
-jedi==0.18.1
-Jinja2==3.1.2
-jsonschema==4.16.0
-jupyter==1.0.0
-jupyter-console==6.4.4
-jupyter-core==4.11.1
-jupyter_client==7.3.5
-jupyterlab-pygments==0.2.2
-jupyterlab-widgets==3.0.3
-lxml==4.9.1
-MarkupSafe==2.1.1
-matplotlib-inline==0.1.6
-mistune==2.0.4
-nbclient==0.6.8
-nbconvert==7.0.0
-nbformat==5.6.1
-nest-asyncio==1.5.5
-notebook==6.4.12
-numpy==1.23.3
-openpyxl==3.0.10
-packaging==21.3
-pandas==1.5.0
-pandocfilters==1.5.0
-parso==0.8.3
-pdfminer.six==20220524
-pdfplumber==0.7.4
-pexpect==4.8.0
-pickleshare==0.7.5
-Pillow==9.2.0
-prometheus-client==0.14.1
-prompt-toolkit==3.0.31
-psutil==5.9.2
-ptyprocess==0.7.0
-pure-eval==0.2.2
-pycparser==2.21
-Pygments==2.13.0
-pyparsing==3.0.9
-pyrsistent==0.18.1
-python-dateutil==2.8.2
-pytz==2022.2.1
-pyzmq==24.0.1
-qtconsole==5.3.2
-QtPy==2.2.0
-Send2Trash==1.8.0
-six==1.16.0
-soupsieve==2.3.2.post1
-stack-data==0.5.1
-terminado==0.15.0
-tinycss2==1.1.1
-tornado==6.2
-traitlets==5.4.0
-Wand==0.6.10
-wcwidth==0.2.5
-webencodings==0.5.1
-widgetsnbextension==4.0.3
+pdfplumber
+numpy
+pandas
```