21 Commits

Author SHA1 Message Date
092b925b68 doc: add utilisation in readme 2025-02-26 09:08:17 +01:00
3c18bd5d81 Feat: script command all works recursively 2025-02-26 09:02:33 +01:00
4ee78a7e7b Feat: specify page type before extracting it 2025-02-26 05:58:38 +01:00
ce8cdc4c1e Feat: use fsm to extract lines from pdf 2025-02-26 05:54:44 +01:00
6e0ffe9085 core: change pandas version 2024-10-16 06:47:55 +02:00
ab2fdb0541 Feat: make from_pdf importable and move plumber in it 2024-10-16 06:47:25 +02:00
0fc39ed317 Merge pull request 'Update dependency MarkupSafe to v2.1.5' (#17) from renovate/markupsafe-2.x into main (Reviewed-on: #17) 2024-02-17 05:08:19 +00:00
a6d6681756 Merge branch 'main' into renovate/markupsafe-2.x 2024-02-17 05:08:08 +00:00
4eecb3a44c Merge pull request 'Update dependency Jinja2 to v3.1.3' (#16) from renovate/jinja2-3.x into main (Reviewed-on: #16) 2024-02-17 05:07:48 +00:00
60da623323 Update dependency MarkupSafe to v2.1.5 2024-02-17 05:04:52 +00:00
1f1e3e2741 Update dependency Jinja2 to v3.1.3 2024-02-17 05:04:48 +00:00
2b3e935f39 Merge pull request 'Update dependency Send2Trash to v1.8.2' (#15) from renovate/send2trash-1.x into main (Reviewed-on: #15) 2024-02-17 04:56:16 +00:00
ef63f22d44 Merge pull request 'Update dependency MarkupSafe to v2.1.3' (#14) from renovate/markupsafe-2.x into main (Reviewed-on: #14) 2024-02-17 04:55:55 +00:00
1020ef9257 Update dependency Send2Trash to v1.8.2 2024-01-10 11:04:32 +00:00
39084ceebd Update dependency MarkupSafe to v2.1.3 2024-01-10 11:04:30 +00:00
7de6c8dd9c clean renovate.json 2024-01-10 10:46:45 +00:00
da3815eea6 activate renovate 2024-01-09 06:53:09 +00:00
45d343d810 Feat: add raise error when src does not exists 2024-01-02 22:22:58 +01:00
806227f202 Feat: add logging in join 2023-12-30 17:45:15 +01:00
7bf0c38883 Feat: add option for debugging 2023-12-30 17:25:40 +01:00
b15b059e2a Add debug 2023-12-27 19:58:12 +01:00
13 changed files with 3063 additions and 1979 deletions

File diff suppressed because one or more lines are too long

README.md

@@ -1,3 +1,23 @@
 # PDF AURALIA
 Extraction of accounting PDF files to xlsx.
+## Usage
+- Run on a single pdf file
+```bash
+pdf_oralia extract on <pdf_file> --dest <where to put produced files>
+```
+- Run on every file in a directory (recursively)
+```bash
+pdf_oralia extract all --src <source folder> --dest <destination folder>
+```
+This command reproduces the structure of the source folder in the destination. Only files that do not already exist are processed: by default, previously produced files are not overwritten.
+The following options can be added:
+- `--force`: overwrite previously produced files
+- `--only-plan`: show which files would be created, without creating them.
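
For anyone scripting against the package rather than the CLI, here is a minimal sketch of the programmatic equivalent (the pdf path and output folder are invented; `extract_save` and its `save=` parameter come from the `pdf_oralia/extract.py` diff below):

```python
from pathlib import Path

from pdf_oralia.extract import extract_save

dest = Path("./out")
dest.mkdir(exist_ok=True)  # extract_save expects the destination folder to exist

# save= selects which of the three xlsx outputs to (re)write
produced = extract_save("./2024 03 releve.pdf", dest,
                        save=["charge", "locataire", "patrimoine"])
print(produced)  # e.g. {'charge': PosixPath('out/2024_03_releve_charge.xlsx'), ...}
```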

pdf_oralia/__init__.py (new file)

@@ -0,0 +1 @@
+from .extract import from_pdf

pdf_oralia/extract.py

@@ -1,10 +1,11 @@
 import logging
 from datetime import datetime
 from pathlib import Path
 
+import pandas as pd
 import pdfplumber
 
-from pdf_oralia.pages import charge, locataire, patrimoine, recapitulatif
+from pdf_oralia.pages import charge, locataire, patrimoine
 
 extract_table_settings = {
     "vertical_strategy": "lines",
@@ -32,68 +33,102 @@ def extract_building(page_text, buildings=["bloch", "marietton", "servient"]):
     raise ValueError("Pas d'immeuble trouvé")
 
 
-def catch_malformed_table(tables):
-    if len(tables) == 2:
-        return tables[0] + tables[1]
-    return tables[0]
+def pdf_extract_tables_lines(pdf):
+    loc_sink = locataire.fsm()
+    next(loc_sink)
+    charge_sink = charge.fsm()
+    next(charge_sink)
+    patrimoine_sink = patrimoine.fsm()
+    next(patrimoine_sink)
 
-
-def from_pdf(pdf):
-    """Build dataframes one about charges and another on loc"""
-    recapitulatif_tables = []
-    loc_tables = []
-    charge_tables = []
-    patrimoie_tables = []
-
     for page_number, page in enumerate(pdf.pages):
         page_text = page.extract_text()
         date = extract_date(page_text)
-        additionnal_fields = {
-            "immeuble": extract_building(page_text),
-            "mois": date.strftime("%m"),
-            "annee": date.strftime("%Y"),
-        }
+        try:
+            additionnal_fields = {
+                "immeuble": extract_building(page_text),
+                "mois": date.strftime("%m"),
+                "annee": date.strftime("%Y"),
+            }
+        except ValueError:
+            logging.warning(
+                f"L'immeuble de la page {page_number+1} non identifiable. Page ignorée."
+            )
+            continue
 
-        if recapitulatif.is_it(page_text):
-            table = page.extract_tables()[0]
-            extracted = recapitulatif.extract(table, additionnal_fields)
-            if extracted:
-                recapitulatif_tables.append(extracted)
-
-        elif locataire.is_it(page_text):
-            tables = page.extract_tables(extract_table_settings)[1:]
-            table = catch_malformed_table(tables)
-            extracted = locataire.extract(table, additionnal_fields)
-            loc_tables.append(extracted)
+        table_type = ""
+        if locataire.is_it(page_text):
+            table_type = "locataire"
         elif charge.is_it(page_text):
-            tables = page.extract_tables(extract_table_settings)[1:]
-            table = catch_malformed_table(tables)
-            extracted = charge.extract(table, additionnal_fields)
-            charge_tables.append(extracted)
+            table_type = "charge"
         elif patrimoine.is_it(page_text):
-            pass
+            table_type = "patrimoine"
+        else:
+            logging.warning(
+                f"Type de la page {page_number+1} non identifiable. Page ignorée."
+            )
+            continue
+
+        for line in page.extract_table(extract_table_settings):
+            if table_type == "locataire":
+                res = loc_sink.send(line)
+                if res:
+                    res.update(additionnal_fields)
+                    yield locataire.Line(**res)
+            elif table_type == "charge":
+                res = charge_sink.send(line)
+                if res:
+                    res.update(additionnal_fields)
+                    yield charge.Line(**res)
+            elif table_type == "patrimoine":
+                res = patrimoine_sink.send(line)
+                if res:
+                    res.update(additionnal_fields)
+                    yield patrimoine.Line(**res)
+
+
+def from_pdf(pdf_file):
+    """Build dataframes one about charges and another on loc"""
+    pdf = pdfplumber.open(pdf_file)
+
+    locataire_lines = []
+    charge_lines = []
+    patrimoine_lines = []
+    for line in pdf_extract_tables_lines(pdf):
+        if isinstance(line, locataire.Line):
+            locataire_lines.append(line)
+        elif isinstance(line, charge.Line):
+            charge_lines.append(line)
+        elif isinstance(line, patrimoine.Line):
+            patrimoine_lines.append(line)
         else:
             logging.warning(f"Page {page_number+1} non reconnu. Page ignorée.")
 
-    df_charge = charge.table2df(recapitulatif_tables + charge_tables)
-    df_loc = locataire.table2df(loc_tables)
-
-    return df_charge, df_loc
+    return {
+        "charge": pd.DataFrame([c.__dict__ for c in charge_lines]),
+        "locataire": pd.DataFrame([c.__dict__ for c in locataire_lines]),
+        "patrimoine": pd.DataFrame([c.__dict__ for c in patrimoine_lines]),
+    }
 
 
-def extract_save(pdf_file, dest):
+def extract_plan(pdf_file, dest):
+    return {
+        "charge": Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_charge.xlsx",
+        "locataire": Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_locataire.xlsx",
+        "patrimoine": Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_patrimoine.xlsx",
+    }
+
+
+def extract_save(pdf_file, dest, save=[]):
     """Extract charge and locataire for pdf_file and put xlsx file in dest"""
     pdf_file = Path(pdf_file)
-    xls_charge = Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_charge.xlsx"
-    xls_locataire = Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_locataire.xlsx"
+    xlss = extract_plan(pdf_file, dest)
 
-    pdf = pdfplumber.open(pdf_file)
-    df_charge, df_loc = from_pdf(pdf)
-
-    df_charge.to_excel(xls_charge, sheet_name="Charges", index=False)
-    logging.info(f"{xls_charge} saved")
-    df_loc.to_excel(xls_locataire, sheet_name="Location", index=False)
-    logging.info(f"{xls_locataire} saved")
+    if save != []:
+        dfs = from_pdf(pdf_file)
+        for s in save:
+            dfs[s].to_excel(xlss[s], sheet_name=s, index=False)
+            logging.info(f"{xlss[s]} saved")
+        return {k: v for k, v in xlss.items() if k in save}
+    return xlss
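
The new pipeline drives one primed coroutine ("sink") per page type: `next()` advances it to its first `yield`, `send()` pushes each table line in, and a non-None yield is a completed record. A self-contained toy illustrating just that protocol (the parsing rule below is invented, not the repo's):

```python
# Minimal sketch of the sink/coroutine pattern used by the fsm() functions.
def fsm():
    row = {}
    line = yield                      # wait for the first table line
    while True:
        if line and line[0]:          # toy rule: a non-empty first cell is a record
            row = {"label": line[0]}
            line = yield row          # emit the finished row to the caller
            row = {}
        else:
            line = yield              # nothing to emit for this line

sink = fsm()
next(sink)                            # prime: run up to the first `yield`
for table_line in [["a"], [""], ["b"]]:
    res = sink.send(table_line)
    if res:
        print(res)                    # {'label': 'a'} then {'label': 'b'}
```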

pdf_oralia/join.py

@@ -1,4 +1,5 @@
 import glob
+import logging
 
 import pandas as pd
@@ -6,9 +7,12 @@ import pandas as pd
 
 def join_excel(src, dest, file_pattern):
     """Join every excel file in arc respecting file_pattern into on unique file in dist"""
     filenames = list_files(src, file_pattern)
+    logging.debug(f"Concatenate {filenames}")
     dfs = extract_dfs(filenames)
     joined_df = pd.concat(dfs)
+    logging.debug(f"Writing joined excel to {dest}")
     joined_df.to_excel(dest, index=False)
+    logging.debug(f"with {len(joined_df)} rows")
 
 
 def list_files(src, file_glob):
@@ -18,5 +22,8 @@ def list_files(src, file_glob):
 def extract_dfs(filenames):
     dfs = []
     for filename in filenames:
-        dfs.append(pd.read_excel(filename))
+        logging.debug(f"Extracting {filename}")
+        df = pd.read_excel(filename)
+        logging.debug(f"Found {len(df)} rows")
+        dfs.append(df)
     return dfs
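
A hedged usage sketch of `join_excel` as defined above (folder and file names invented; the `*_charge.xlsx` pattern matches what `extract_save` produces):

```python
from pdf_oralia.join import join_excel

# Concatenate every per-building charge workbook under ./out into one file.
join_excel("./out", "./all_charge.xlsx", "*_charge.xlsx")
```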

pdf_oralia/pages/charge.py

@@ -1,9 +1,16 @@
 import re
 
-import numpy as np
-import pandas as pd
+from pydantic import BaseModel, field_validator
 
-RECAPITULATIF_DES_OPERATIONS = 1
+HEADER_CHARGE = [
+    "",
+    "RECAPITULATIF DES OPERATIONS",
+    "Débits",
+    "Crédits",
+    "Dont T.V.A.",
+    "Locatif",
+    "Déductible",
+]
 
 DF_TYPES = {
     "Fournisseur": str,
     "RECAPITULATIF DES OPERATIONS": str,
@@ -17,7 +24,30 @@ DF_TYPES = {
     "annee": str,
     "lot": str,
 }
 
-DEFAULT_FOURNISSEUR = "ROSIER MODICA MOTTEROZ SA"
+
+class Line(BaseModel):
+    mois: int
+    annee: int
+    immeuble: str
+    lot: str
+    Champs: str
+    Categorie: str
+    Fournisseur: str
+    Libellé: str
+    Débit: float
+    Crédits: float
+    Dont_TVA: float
+    Locatif: float
+    Déductible: float
+
+    @field_validator(
+        "Débit", "Crédits", "Dont_TVA", "Locatif", "Déductible", mode="before"
+    )
+    def set_default_if_empty(cls, v):
+        if v == "":
+            return 0
+        return v
 
 
 def is_it(page_text):
@@ -41,51 +71,54 @@ def get_lot(txt):
     return "*"
 
 
-def keep_row(row):
-    return not any(
-        [
-            word.lower() in row[RECAPITULATIF_DES_OPERATIONS].lower()
-            for word in ["TOTAL", "TOTAUX", "Solde créditeur", "Solde débiteur"]
-        ]
-    )
-
-
-def extract(table, additionnal_fields: dict = {}):
-    """Turn table to dictionary with additional fields"""
-    extracted = []
-    header = table[0]
-    for row in table[1:]:
-        if keep_row(row):
-            r = dict()
-            for i, value in enumerate(row):
-                if header[i] == "":
-                    r["Fournisseur"] = value
-                else:
-                    r[header[i]] = value
-
-            for k, v in additionnal_fields.items():
-                r[k] = v
-
-            if "honoraire" in row[RECAPITULATIF_DES_OPERATIONS].lower():
-                r["Fournisseur"] = DEFAULT_FOURNISSEUR
-
-            extracted.append(r)
-
-    return extracted
-
-
-def table2df(tables):
-    dfs = []
-    for table in tables:
-        df = (
-            pd.DataFrame.from_records(table)
-            .replace("", np.nan)
-            .dropna(subset=["Débits", "Crédits"], how="all")
-        )
-        df["Fournisseur"] = df["Fournisseur"].fillna(method="ffill")
-        dfs.append(df)
-    df = pd.concat(dfs)
-
-    df["immeuble"] = df["immeuble"].apply(lambda x: x[0].capitalize())
-    df["lot"] = df["RECAPITULATIF DES OPERATIONS"].apply(get_lot)
-    return df.astype(DF_TYPES)
+def fsm():
+    current_state = "total"
+    row = {}
+    line = yield
+    while True:
+        if line == HEADER_CHARGE:
+            line = yield
+        if current_state == "total":
+            if line[1].lower().split(" ")[0] in ["total", "totaux"]:
+                current_state = "new_champs"
+            line = yield
+        elif current_state == "new_champs":
+            if line[0] != "":
+                current_state = "new_cat_line"
+                row = {"Champs": line[0], "Categorie": "", "Fournisseur": ""}
+            line = yield
+        elif current_state == "new_cat_line":
+            if line[1].lower().split(" ")[0] in ["total", "totaux"]:
+                current_state = "new_champs"
+                line = yield
+                row = {}
+            elif line[2] != "" or line[3] != "":
+                row.update(
+                    {
+                        "Fournisseur": line[0] if line[0] != "" else row["Fournisseur"],
+                        "Libellé": line[1],
+                        "lot": get_lot(line[1]),
+                        "Débit": line[2],
+                        "Crédits": line[3],
+                        "Dont_TVA": line[4],
+                        "Locatif": line[5],
+                        "Déductible": line[6],
+                    }
+                )
+                line = yield row
+                row = {
+                    "Champs": row["Champs"],
+                    "Categorie": row["Categorie"],
+                    "Fournisseur": row["Fournisseur"],
+                }
+            elif line[0] != "" and line[1] == "":
+                row.update({"Categorie": line[0]})
+                line = yield
+            elif line[1] != "":
+                row.update({"Categorie": line[1]})
+                line = yield
+            elif line[0] != "":
+                row.update({"Fournisseur": line[0]})
+                line = yield
+            else:
+                line = yield
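
The `Line` models lean on pydantic v2's `mode="before"` validators to turn empty table cells into 0 before float coercion runs. A standalone sketch of that mechanism (field set shortened to two columns):

```python
from pydantic import BaseModel, field_validator


class Row(BaseModel):
    Débit: float
    Crédits: float

    # mode="before" runs ahead of float coercion, so "" can be replaced by 0
    @field_validator("Débit", "Crédits", mode="before")
    def set_default_if_empty(cls, v):
        if v == "":
            return 0
        return v


print(Row(Débit="", Crédits="12.5"))  # Débit=0.0 Crédits=12.5
```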

pdf_oralia/pages/locataire.py

@@ -1,22 +1,48 @@
-import numpy as np
-import pandas as pd
+from pydantic import BaseModel, field_validator
 
-DF_TYPES = {
-    "Locataires": str,
-    "Période": str,
-    "Loyers": float,
-    "Taxes": float,
-    "Provisions": float,
-    "Divers": str,
-    "Total": float,
-    "Réglés": float,
-    "Impayés": float,
-    "immeuble": str,
-    "mois": str,
-    "annee": str,
-    "Lot": str,
-    "Type": str,
-}
+HEADER_LOC = [
+    "Locataires",
+    "Période",
+    "Loyers",
+    "Taxes",
+    "Provisions",
+    "Divers",
+    "",
+    "Total",
+    "Réglés",
+    "Impayés",
+]
+
+
+class Line(BaseModel):
+    mois: int
+    annee: int
+    immeuble: str
+    Lot: str
+    Type: str
+    Locataire: str
+    Loyers: float
+    Taxes: float
+    Provisions: float
+    Divers: float
+    Total: float
+    Réglés: float
+    Impayés: float
+
+    @field_validator(
+        "Loyers",
+        "Taxes",
+        "Provisions",
+        "Divers",
+        "Total",
+        "Réglés",
+        "Impayés",
+        mode="before",
+    )
+    def set_default_if_empty(cls, v):
+        if v == "":
+            return 0
+        return v
 
 
 def is_it(page_text):
@@ -25,142 +51,43 @@ def is_it(page_text):
     return False
 
 
-def is_drop(row):
-    if "totaux" in row[0].lower():
-        return True
-    if not any(row):
-        return True
-    return False
-
-
-def extract(table, additionnal_fields: dict = {}):
-    """Turn table to dictionary with additional fields"""
-    extracted = []
-    header = table[0]
-    for row in table[1:]:
-        if not is_drop(row):
-            r = dict()
-            for i, value in enumerate(row):
-                if header[i] != "":
-                    r[header[i]] = value
-            for k, v in additionnal_fields.items():
-                r[k] = v
-            extracted.append(r)
-    return extracted
-
-
-def join_row(last, next):
-    row = {}
-    for key in last:
-        if last[key] == next[key]:
-            row[key] = last[key]
-        elif last[key] and next[key]:
-            row[key] = f"{last[key]}\n{next[key]}"
-        elif last[key]:
-            row[key] = last[key]
-        elif next[key]:
-            row[key] = next[key]
-        else:
-            row[key] = ""
-    return row
-
-
-def join_tables(tables):
-    joined = tables[0]
-    for t in tables[1:]:
-        last_row = joined[-1]
-        if "totaux" not in last_row["Locataires"].lower():
-            first_row = t[0]
-            joined_row = join_row(last_row, first_row)
-            joined = joined[:-1] + [joined_row] + t[1:]
-        else:
-            joined += t
-    return joined
-
-
 def parse_lot(string):
     words = string.split(" ")
     return {"Lot": "{:02d}".format(int(words[1])), "Type": " ".join(words[2:])}
 
 
-def clean_type(string):
-    if "appartement" in string.lower():
-        return string[-2:]
-    return string
-
-
-def join_row(table):
-    joined = []
-    for row in table:
-        if row["Locataires"].startswith("Lot"):
-            row.update(parse_lot(row["Locataires"]))
-            row["Locataires"] = ""
-            joined.append(row)
-        elif row["Locataires"] == "Rappel de Loyer":
-            last_row = joined[-1]
-            row.update(
-                {
-                    "Lot": last_row["Lot"],
-                    "Type": last_row["Type"],
-                    "Locataires": last_row["Locataires"],
-                    "Divers": "Rappel de Loyer",
-                }
-            )
-            joined.append(row)
-        elif row["Locataires"]:
-            last_row = joined.pop()
-            row_name = row["Locataires"].replace("\n", " ")
-            row.update({k: v for k, v in last_row.items() if v})
-            row["Locataires"] = last_row["Locataires"] + " " + row_name
-            joined.append(row)
-        else:
-            if row["Période"].startswith("Solde"):
-                last_row = joined.pop()
-                row.update(
-                    {
-                        "Lot": last_row["Lot"],
-                        "Type": last_row["Type"],
-                        "Locataires": last_row["Locataires"],
-                    }
-                )
-                joined.append(row)
-            elif row["Période"].startswith("Du"):
-                last_row = joined[-1]
-                row.update(
-                    {
-                        "Lot": last_row["Lot"],
-                        "Type": last_row["Type"],
-                        "Locataires": last_row["Locataires"],
-                    }
-                )
-                joined.append(row)
-    return joined
-
-
-def flat_tables(tables):
-    tables_flat = []
-    for table in tables:
-        tables_flat.extend(table)
-    return tables_flat
-
-
-def table2df(tables):
-    tables = flat_tables(tables)
-    joined = join_row(tables)
-    df = pd.DataFrame.from_records(joined)
-    df["immeuble"] = df["immeuble"].apply(lambda x: x[0].capitalize())
-    df["Type"] = df["Type"].apply(clean_type)
-    numeric_cols = [k for k, v in DF_TYPES.items() if v == float]
-    df[numeric_cols] = df[numeric_cols].replace("", np.nan)
-    df = df.drop(df[(df["Locataires"] == "") & (df["Période"] == "")].index)
-    return df.astype(DF_TYPES)
+def fsm():
+    current_state = "new_row"
+    row = {}
+    line = yield
+    while True:
+        if line == HEADER_LOC:
+            line = yield
+        elif current_state == "new_row":
+            if line[0] != "" and line[0] != "TOTAUX":
+                row.update(parse_lot(line[0]))
+                current_state = "add_loc"
+            line = yield
+        elif current_state == "add_loc":
+            if line[0] != "":
+                row["Locataire"] = line[0]
+                current_state = "add_totaux"
+            line = yield
+        elif current_state == "add_totaux":
+            if line[0] == "Totaux":
+                row.update(
+                    {
+                        "Loyers": line[2],
+                        "Taxes": line[3],
+                        "Provisions": line[4],
+                        "Divers": line[5],
+                        "Total": line[7],
+                        "Réglés": line[8],
+                        "Impayés": line[9],
+                    }
+                )
+                line = yield row
+                row = {}
+                current_state = "new_row"
+            else:
+                line = yield
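
`parse_lot`, kept unchanged above, is what the `new_row` state applies to the first cell of a lot header row. A quick behavioral sketch (the input string is invented):

```python
def parse_lot(string):  # copied from the diff above
    words = string.split(" ")
    return {"Lot": "{:02d}".format(int(words[1])), "Type": " ".join(words[2:])}


print(parse_lot("Lot 2 Appartement T3"))  # {'Lot': '02', 'Type': 'Appartement T3'}
```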

pdf_oralia/pages/patrimoine.py

@@ -1,4 +1,74 @@
+from pydantic import BaseModel, field_validator
+
+HEADER_PATRIMOINE = [
+    "Etage",
+    "Lots",
+    "Type de lot",
+    "Nom du Locataire",
+    "Loyer Annuel",
+    "Début Bail",
+    "Fin Bail",
+    "Entrée",
+    "Départ",
+    "Révisé le",
+    "U",
+    "Dépôt Gar.",
+]
+
+
+class Line(BaseModel):
+    mois: int
+    annee: int
+    immeuble: str
+    Etage: str
+    Lot: str
+    Type: str
+    Locataire: str
+    Loyer_annuel: int
+    Debut_bail: str
+    Fin_bail: str
+    Entree: str
+    Depart: str
+    Revision_bail: str
+    Usage: str
+    Depot_garantie: float
+
+    @field_validator("Loyer_annuel", "Depot_garantie", mode="before")
+    def set_default_if_empty(cls, v):
+        if v == "":
+            return 0
+        return v
+
+
 def is_it(page_text):
     if "VOTRE PATRIMOINE" in page_text:
         return True
     return False
+
+
+def fsm():
+    current_state = "new_line"
+    row = {}
+    line = yield
+    while True:
+        if line == HEADER_PATRIMOINE:
+            line = yield
+        if current_state == "new_line":
+            if line[0] != "":
+                row = {
+                    "Etage": line[0],
+                    "Lot": line[1][-2:] if line[1] != "" else row["Lot"],
+                    "Type": line[2] if line[2] != "" else row["Type"],
+                    "Locataire": line[3],
+                    "Loyer_annuel": line[4].replace(" ", ""),
+                    "Debut_bail": line[5],
+                    "Fin_bail": line[6],
+                    "Entree": line[7],
+                    "Depart": line[8],
+                    "Revision_bail": line[9],
+                    "Usage": line[10],
+                    "Depot_garantie": line[11].replace(" ", ""),
+                }
+                line = yield row
+            else:
+                line = yield
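
In the patrimoine fsm, a blank "Lots" or "Type de lot" cell inherits the value from the previous row. A toy rendering of that carry-forward rule with invented rows (columns reduced to four):

```python
rows = [
    ["1er", "01", "Appartement", "DUPONT"],
    ["1er", "", "", "MARTIN"],  # blank Lot/Type cells: reuse the previous row's
]
row = {}
for line in rows:
    row = {
        "Etage": line[0],
        "Lot": line[1] if line[1] != "" else row["Lot"],
        "Type": line[2] if line[2] != "" else row["Type"],
        "Locataire": line[3],
    }
    print(row)
# {'Etage': '1er', 'Lot': '01', 'Type': 'Appartement', 'Locataire': 'DUPONT'}
# {'Etage': '1er', 'Lot': '01', 'Type': 'Appartement', 'Locataire': 'MARTIN'}
```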

pdf_oralia/scripts.py

@@ -4,31 +4,34 @@ from pathlib import Path
 import click
 
-from .extract import extract_save
+from .extract import extract_save, extract_plan
 from .join import join_excel
 
-logging_config = dict(
-    version=1,
-    formatters={"f": {"format": "%(levelname)-8s %(name)-12s %(message)s"}},
-    handlers={
-        "h": {
-            "class": "logging.StreamHandler",
-            "formatter": "f",
-            "level": logging.DEBUG,
-        }
-    },
-    root={
-        "handlers": ["h"],
-        "level": logging.DEBUG,
-    },
-)
-dictConfig(logging_config)
-
 
 @click.group()
-def main():
-    pass
+@click.option("--debug/--no-debug", default=False)
+def main(debug):
+    if debug:
+        logging_level = logging.DEBUG
+    else:
+        logging_level = logging.INFO
+    logging_config = dict(
+        version=1,
+        formatters={"f": {"format": "%(levelname)-8s %(name)-12s %(message)s"}},
+        handlers={
+            "h": {
+                "class": "logging.StreamHandler",
+                "formatter": "f",
+                "level": logging_level,
+            }
+        },
+        root={
+            "handlers": ["h"],
+            "level": logging_level,
+        },
+    )
+    dictConfig(logging_config)
 
 
 @main.group()
@@ -48,18 +51,45 @@ def on(pdf_file, dest):
 
 @extract.command()
-@click.option("--src", help="Tous les fichiers dans folder", default="./")
+@click.option(
+    "--src", help="Tous les fichiers dans folder (de façon récursive)", default="./"
+)
 @click.option("--dest", help="Où mettre les fichiers produits", default="./")
-def all(src, dest):
-    p = Path(src)
-    d = Path(dest)
-    d.mkdir(exist_ok=True)
-    pdf_files = [x for x in p.iterdir() if ".pdf" in str(x)]
-    for pdf_file in pdf_files:
+@click.option(
+    "--only-plan",
+    help="Ne produit rien mais indique les changements",
+    default=False,
+    is_flag=True,
+)
+@click.option(
+    "--force",
+    help="Écrase les fichiers produits précédemment",
+    default=False,
+    is_flag=True,
+)
+def all(src, dest, force, only_plan):
+    src_path = Path(src)
+    dest = Path(dest)
+    dest.mkdir(exist_ok=True)
+    for pdf_file in src_path.rglob("**/*.pdf"):
+        relative_path = pdf_file.relative_to(src_path)
+        files_dest = dest / relative_path.parent
         logging.info(f"Found {pdf_file}")
-        extract_save(pdf_file, d)
+
+        plan_dest = extract_plan(pdf_file, files_dest)
+        save = []
+        for k, p in plan_dest.items():
+            if not p.exists() or force:
+                save.append(k)
+
+        if only_plan:
+            for s in save:
+                logging.info(f"Planing to create {plan_dest[s]}")
+        else:
+            files_dest.mkdir(parents=True, exist_ok=True)
+            extract_save(pdf_file, files_dest, save)
 
 
 @main.command()
@@ -87,6 +117,8 @@ def join(src, dest, force):
     if not force and Path(dest_locataire).exists():
         raise ValueError(f"The file {dest_locataire} already exists")
 
+    if not Path(src).exists():
+        raise ValueError(f"The source directory ({src}) does not exists.")
+
     join_excel(src, dest_charge, "*_charge.xlsx")
     logging.info(f"Les données charges ont été concaténées dans {dest_charge}")
     join_excel(src, dest_locataire, "*_locataire.xlsx")
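
A hedged sketch of exercising the new `--debug` group flag and the recursive `extract all` command with click's test runner (`CliRunner` is part of click; the paths are invented):

```python
from click.testing import CliRunner

from pdf_oralia.scripts import main

runner = CliRunner()
result = runner.invoke(
    main,
    ["--debug", "extract", "all", "--src", "./pdfs", "--dest", "./out", "--only-plan"],
)
print(result.output)  # log output capture may vary with the click version
```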

poetry.lock (generated)

File diff suppressed because it is too large

pyproject.toml

@@ -13,7 +13,7 @@ pdf-oralia = "pdf_oralia.scripts:main"
 python = "^3.10"
 click = "^8.1.3"
 pdfplumber = "^0.7.4"
-pandas = "^1.5.0"
+pandas = "^2.2.3"
 openpyxl = "^3.0.10"

renovate.json (new file)

@@ -0,0 +1,2 @@
+{
+}

requirements.txt

@@ -1,76 +1,5 @@
-argon2-cffi==21.3.0
-argon2-cffi-bindings==21.2.0
-asttokens==2.0.8
-attrs==22.1.0
-backcall==0.2.0
-beautifulsoup4==4.11.1
-bleach==5.0.1
-cffi==1.15.1
-charset-normalizer==2.1.1
-cryptography==38.0.1
-debugpy==1.6.3
-decorator==5.1.1
-defusedxml==0.7.1
-entrypoints==0.4
-et-xmlfile==1.1.0
-executing==1.1.0
-fastjsonschema==2.16.2
-ipykernel==6.16.0
-ipython==8.5.0
-ipython-genutils==0.2.0
-ipywidgets==8.0.2
-jedi==0.18.1
-Jinja2==3.1.2
-jsonschema==4.16.0
-jupyter==1.0.0
-jupyter-console==6.4.4
-jupyter-core==4.11.1
-jupyter_client==7.3.5
-jupyterlab-pygments==0.2.2
-jupyterlab-widgets==3.0.3
-lxml==4.9.1
-MarkupSafe==2.1.1
-matplotlib-inline==0.1.6
-mistune==2.0.4
-nbclient==0.6.8
-nbconvert==7.0.0
-nbformat==5.6.1
-nest-asyncio==1.5.5
-notebook==6.4.12
-numpy==1.23.3
-openpyxl==3.0.10
-packaging==21.3
-pandas==1.5.0
-pandocfilters==1.5.0
-parso==0.8.3
-pdfminer.six==20220524
-pdfplumber==0.7.4
-pexpect==4.8.0
-pickleshare==0.7.5
-Pillow==9.2.0
-prometheus-client==0.14.1
-prompt-toolkit==3.0.31
-psutil==5.9.2
-ptyprocess==0.7.0
-pure-eval==0.2.2
-pycparser==2.21
-Pygments==2.13.0
-pyparsing==3.0.9
-pyrsistent==0.18.1
-python-dateutil==2.8.2
-pytz==2022.2.1
-pyzmq==24.0.1
-qtconsole==5.3.2
-QtPy==2.2.0
-Send2Trash==1.8.0
-six==1.16.0
-soupsieve==2.3.2.post1
-stack-data==0.5.1
-terminado==0.15.0
-tinycss2==1.1.1
-tornado==6.2
-traitlets==5.4.0
-Wand==0.6.10
-wcwidth==0.2.5
-webencodings==0.5.1
-widgetsnbextension==4.0.3
+pdfplumber
+numpy
+pandas
+click
+openpyxl