Feat: add *.duckdb in gitignore

Feat: create pdf ingest pipeline
Feat: add commands in makefile
2024-06-18 06:30:27 +02:00 · 2024-06-18 06:26:51 +02:00 · 2024-06-11 17:54:25 +02:00 · 2024-04-15 11:59:45 +02:00 · 2024-04-15 11:59:32 +02:00
8 changed files with 125 additions and 32 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -159,3 +159,5 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 *.duckdb
--- a/Makefile
+++ b/Makefile
@@ -42,6 +42,18 @@ clean_built:
 	rm -rf $(DATA_BASE)/staging/**/*.csv
 	rm -rf $(DATA_BASE)/gold/**/*.csv
 	rm -rf $(DATA_BASE)/datamart/**/*.csv
 	rm -rf $(DATA_BASE)/datamart/**/*.xlsx
 run_ingest:
 	python -m scripts ingest
 run_feature:
 	python -m scripts feature
 run_datamart:
 	python -m scripts datamart
 build: clean_built run_ingest run_feature run_datamart
 clean_all: clean_built clean_raw
@@ -50,3 +62,5 @@ import_nextcloud:
 push_nextcloud:
 	rsync -av ./datas/datamart/ ~/Nextcloud/PLESNA\ Compta\ SYSTEM/DataMart
--- a/dlt/pdf_pipeline.py
+++ b/dlt/pdf_pipeline.py
@@ -0,0 +1,33 @@
 import dlt
 from pathlib import Path
 from pdf_oralia.extract import from_pdf
 import pdfplumber
 DATA_PATH = Path("datas/")
 assert DATA_PATH.exists()
 RAW_CRG_PDF = DATA_PATH / "pdfs"
 assert RAW_CRG_PDF.exists()
@dlt.resource(name="crg")
 def crg_pdf(filename):
    print(filename)
    pdf = pdfplumber.open(filename)
    try:
        df_charge, df_loc = from_pdf(pdf)
    except ValueError as e:
        print(f"\tExtract Error: {e}")
        pass
    else:
        for row in df_charge.to_dict("records"):
            yield row
 if __name__ == "__main__":
    pipeline = dlt.pipeline(
        pipeline_name='raw', destination="duckdb", dataset_name="crg"
    )
    for pdf_file in RAW_CRG_PDF.glob("**/*.pdf"):
        load_info = pipeline.run(crg_pdf(pdf_file), table_name='charge')
        print(load_info)
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,5 @@ pandas==1.5.0
 pdf-oralia==0.3.11
 pydantic==2.6.1
 click==8.1.7
 dlt[duckdb]>=0.4.3a0
 openpyxl>=3.0.0
--- a/scripts/main.py
+++ b/scripts/main.py
@@ -79,7 +79,11 @@ def feature():
 def datamart():
    from .gold_mart import FLUXES_LOT
-    consume_fluxes(fluxes=FLUXES_LOT, origin_path=GOLD_PATH, dest_path=MART_PATH)
+    consume_fluxes(
        fluxes=FLUXES_LOT,
        origin_path=GOLD_PATH,
        dest_path=MART_PATH,
    )
 if __name__ == "__main__":
--- a/scripts/flux.py
+++ b/scripts/flux.py
@@ -4,7 +4,7 @@ from collections.abc import Callable
 from pathlib import Path
 import pandas as pd
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 class Source(BaseModel):
@@ -38,21 +38,51 @@ class Transformation(BaseModel):
    extra_kwrds: dict = {}
 def to_csv(df, dest_basename: Path) -> Path:
    dest = dest_basename.parent / (dest_basename.stem + ".csv")
    if dest.exists():
        df.to_csv(dest, mode="a", header=False, index=False)
    else:
        df.to_csv(dest, index=False)
    return dest
 def to_excel(df, dest_basename: Path) -> Path:
    dest = dest_basename.parent / (dest_basename.stem + ".xlsx")
    if dest.exists():
        raise ValueError(f"The destination exits {dest}")
    else:
        df.to_excel(dest)
    return dest
 class Destination(BaseModel):
    name: str
    writer: Callable = Field(to_csv)
    def _write(
        self,
        df: pd.DataFrame,
        dest_basename: Path,
        writing_func: Callable | None = None,
    ) -> Path:
        if writing_func is None:
            writing_func = self.writer
        return writing_func(df, dest_basename)
    def write(
-        self, df: pd.DataFrame, dest_path: Path, writing_func: Callable
+        self, df: pd.DataFrame, dest_path: Path, writing_func: Callable | None = None
    ) -> list[Path]:
        dest_basename = dest_path / self.name
-        return [writing_func(df, dest_basename)]
+        return [self._write(df, dest_basename, writing_func)]
 class SplitDestination(Destination):
    split_column: str
    def write(
-        self, df: pd.DataFrame, dest_path: Path, writing_func: Callable
+        self, df: pd.DataFrame, dest_path: Path, writing_func: Callable | None = None
    ) -> list[Path]:
        wrote_files = []
@@ -60,7 +90,7 @@ class SplitDestination(Destination):
            filtered_df = df[df[self.split_column] == col_value]
            dest_basename = dest_path / f"{self.name}-{col_value}"
-            dest = writing_func(filtered_df, dest_basename)
+            dest = self._write(filtered_df, dest_basename, writing_func)
            wrote_files.append(dest)
        return wrote_files
@@ -72,15 +102,6 @@ class Flux(BaseModel):
    destination: Destination
 def to_csv(df, dest_basename: Path) -> Path:
    dest = dest_basename.parent / (dest_basename.stem + ".csv")
    if dest.exists():
        df.to_csv(dest, mode="a", header=False, index=False)
    else:
        df.to_csv(dest, index=False)
    return dest
 def write_split_by(
    df: pd.DataFrame, column: str, dest_path: Path, name: str, writing_func
 ) -> list[Path]:
@@ -119,16 +140,13 @@ def split_duplicates(
    return no_duplicates, duplicated
-def consume_fluxes(
+def consume_flux(
-    fluxes: dict[str, Flux],
+    name: str,
    flux: Flux,
    origin_path: Path,
    dest_path: Path,
-    writing_func=to_csv,
+    duplicated={},
 ):
    duplicated = {}
    wrote_files = []
    for name, flux in fluxes.items():
    logging.info(f"Consume {name}")
    src_df = []
    for filename, df in extract_sources(flux.sources, origin_path):
@@ -138,7 +156,22 @@ def consume_fluxes(
    logging.info(f"Execute {flux.transformation.function.__name__}")
    df = flux.transformation.function(src_df, **flux.transformation.extra_kwrds)
-        files = flux.destination.write(df, dest_path, writing_func)
+
    files = flux.destination.write(df, dest_path)
    logging.info(f"{files} written")
    return files
 def consume_fluxes(
    fluxes: dict[str, Flux],
    origin_path: Path,
    dest_path: Path,
 ):
    duplicated = {}
    wrote_files = []
    for name, flux in fluxes.items():
        files = consume_flux(name, flux, origin_path, dest_path, duplicated)
        wrote_files += files
    return wrote_files
--- a/scripts/gold_mart.py
+++ b/scripts/gold_mart.py
@@ -11,6 +11,7 @@ from scripts.flux import (
    SplitDestination,
    Transformation,
    consume_fluxes,
    to_excel,
 )
 logger = logging.getLogger(__name__)
@@ -26,7 +27,9 @@ FLUXES_LOT = {
    "Lots": Flux(
        sources=[CSVSource(filename="CRG/crg-*.csv")],
        transformation=Transformation(function=build_lots),
-        destination=SplitDestination(name="Lot/lot", split_column="Lot"),
+        destination=SplitDestination(
            name="Lot/lot", split_column="Lot", writer=to_excel
        ),
    ),
 }
@@ -75,6 +78,8 @@ if __name__ == "__main__":
    pnl_fluxes = {}
    files = consume_fluxes(
-        fluxes=pnl_fluxes, origin_path=gold_path, dest_path=mart_path
+        fluxes=pnl_fluxes,
        origin_path=gold_path,
        dest_path=mart_path,
    )
    print(files)
--- a/scripts/history_stagging.py
+++ b/scripts/history_stagging.py
@@ -12,9 +12,9 @@ logger.setLevel(logging.DEBUG)
 def extract_cat(cat: pd.DataFrame):
-    cat_drop = list(cat[cat["Nouvelles"] == "NE PAS IMPORTER"]["Anciennes"])
+    cat_drop = list(cat[cat["Nouvelles"] == "NE PAS IMPORTER"])
    # cat_drop = list(cat[cat["Nouvelles"] == "NE PAS IMPORTER"]["Anciennes"])
    cat_trans = cat[cat["Nouvelles"] != "NE PAS IMPORTER"]
    trans = {}
    for _, (old, new) in cat_trans.iterrows():
        trans[old] = new
@@ -140,7 +140,7 @@ FLUXES_CRG = {
    ),
    "2022 - charge.xlsx": Flux(
        sources=[
-            ExcelSource(filename="2022 - charge.xlsx", sheet_name="Sheet1"),
+            ExcelSource(filename="2022 - charge.xlsx", sheet_name="DB CRG"),
        ],
        transformation=Transformation(
            function=trans_2022_charge,
Author	SHA1	Message	Date
Bertrand Benjamin	4249e902b2	Feat: add *.duckdb in gitignore	2024-06-18 06:30:27 +02:00
Bertrand Benjamin	ab36931c06	Feat: create pdf ingest pipeline	2024-06-18 06:26:51 +02:00
Bertrand Benjamin	1ed6ed43ed	Feat: add commands in makefile	2024-06-11 17:54:25 +02:00
Bertrand Benjamin	215e26b84f	Feat: adapt to new excel format	2024-04-15 11:59:45 +02:00
Bertrand Benjamin	b60fa3be17	Feat: add excel export for mart	2024-04-15 11:59:32 +02:00