8 Commits

Author SHA1 Message Date
48e75358ac Fix: remove index in excel outputs 2023-10-05 15:22:14 +02:00
132e37267b Feat: logging and option about overwritting 2023-10-05 15:19:16 +02:00
f2bcf6241a Fix: rebuild join_excel 2023-10-05 15:10:39 +02:00
ec9cc19be5 fix: remove when 2023-09-20 09:37:50 +02:00
0040dccd9a Feat: Handle get_lot when RECAPITULATIF is nan 2023-09-20 09:28:57 +02:00
b0333cddd8 fix: raise a warning when a page is not recognized 2023-09-20 09:27:40 +02:00
406b89fea1 Feat: publish tag on Matrix 2023-07-08 09:08:09 +02:00
812d392720 feat: publish to matrix 2023-07-08 09:06:25 +02:00
6 changed files with 68 additions and 10 deletions

View File

@@ -22,10 +22,18 @@ steps:
       PYPI_TOKEN:
         from_secret: pypi_token
     when:
       event:
         include:
           - tag
+  - name: Notify on matrix
+    image: plugins/matrix
+    environment:
+      MATRIX_ROOMID:
+        from_secret: MATRIX_ROOMID
+      MATRIX_ACCESSTOKEN:
+        from_secret: MATRIX_ACCESSTOKEN
+      MATRIX_USERID:
+        from_secret: MATRIX_USERID
+    settings:
+      homeserver: https://matrix.poneyworld.net
+      template: "Une nouvelle version (${DRONE_TAG}) de pdf-oralia est publiée!"
 # Déclencheur de la pipeline
 trigger:

View File

@@ -45,7 +45,7 @@ def from_pdf(pdf):
     charge_tables = []
     patrimoie_tables = []
-    for page in pdf.pages:
+    for page_number, page in enumerate(pdf.pages):
         page_text = page.extract_text()
         date = extract_date(page_text)
         additionnal_fields = {
@@ -76,7 +76,7 @@ def from_pdf(pdf):
             pass
         else:
-            raise ValueError("Page non reconnu")
+            logging.warning(f"Page {page_number+1} non reconnu. Page ignorée.")
     df_charge = charge.table2df(recapitulatif_tables + charge_tables)
     df_loc = locataire.table2df(loc_tables)

pdf_oralia/join.py Normal file (22 additions)
View File

@@ -0,0 +1,22 @@
import glob

import pandas as pd


def join_excel(src, dest, file_pattern):
    """Join every excel file in src matching file_pattern into one unique file in dest"""
    filenames = list_files(src, file_pattern)
    dfs = extract_dfs(filenames)
    joined_df = pd.concat(dfs)
    joined_df.to_excel(dest, index=False)


def list_files(src, file_glob):
    return list(glob.iglob(f"{src}/{file_glob}"))


def extract_dfs(filenames):
    dfs = []
    for filename in filenames:
        dfs.append(pd.read_excel(filename))
    return dfs
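
For orientation, a minimal usage sketch of the new join_excel helper; the directory layout and file names below are invented for illustration, not taken from the repository:

from pdf_oralia.join import join_excel

# Hypothetical layout: ./sorties/ holds per-month exports such as
# 2023_01_charge.xlsx, 2023_02_charge.xlsx, ...
# This concatenates them into a single workbook without the pandas index column.
join_excel(src="./sorties", dest="./sorties/charge.xlsx", file_pattern="*_charge.xlsx")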

View File

@@ -32,7 +32,10 @@ def is_it(page_text):
 def get_lot(txt):
     """Return lot number from "RECAPITULATIF DES OPERATIONS" """
     regex = r"[BSM](\d+)(?=\s*-)"
-    result = re.findall(regex, txt)
+    try:
+        result = re.findall(regex, txt)
+    except TypeError:
+        return "*"
     if result:
         return "{:02d}".format(int(result[0]))
     return "*"

View File

@@ -5,6 +5,7 @@ from pathlib import Path
 import click
 from .extract import extract_save
+from .join import join_excel
 logging_config = dict(
     version=1,
@@ -64,5 +65,29 @@ def all(src, dest):
 @main.command()
 @click.option("--src", help="Tous les fichiers dans src", default="./")
 @click.option("--dest", help="Où mettre les fichiers produits", default="")
-def join(src, dest):
-    join_excel(src, dest, df_names=["charge", "locataire"])
+@click.option(
+    "--force",
+    help="Ecraser si le fichier destination existe.",
+    default=False,
+    is_flag=True,
+)
+def join(src, dest, force):
+    """Join tous les fichiers excel charge (resp locataire) de src dans un seul fichier charge.xlsx dans dest.
+    Exemple:
+        pdf-oralia join --src <dossier_source> --dest <dossier_destination>
+    """
+    dest_charge = f"{dest}/charge.xlsx"
+    if not force and Path(dest_charge).exists():
+        raise ValueError(f"The file {dest_charge} already exists")
+    dest_locataire = f"{dest}/locataire.xlsx"
+    if not force and Path(dest_locataire).exists():
+        raise ValueError(f"The file {dest_locataire} already exists")
+    join_excel(src, dest_charge, "*_charge.xlsx")
+    logging.info(f"Les données charges ont été concaténées dans {dest_charge}")
+    join_excel(src, dest_locataire, "*_locataire.xlsx")
+    logging.info(f"Les données locataires ont été concaténées dans {dest_locataire}")
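
As a rough illustration of the new --force flag, a sketch using click's test runner; the module path pdf_oralia.main for the CLI is an assumption, not taken from the diff:

from click.testing import CliRunner

from pdf_oralia.main import join  # assumed (hypothetical) module path for the CLI above

runner = CliRunner()
# Without --force, the command refuses to overwrite an existing charge.xlsx / locataire.xlsx.
result = runner.invoke(join, ["--src", "./sorties", "--dest", "./concat"])
# With --force, existing destination files are overwritten.
result = runner.invoke(join, ["--src", "./sorties", "--dest", "./concat", "--force"])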

View File

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pdf-oralia"
-version = "dev"
+version = "0"
 description = ""
 authors = ["Bertrand Benjamin <benjamin.bertrand@opytex.org>"]
 readme = "README.md"