Feat: add *.duckdb in gitignore

Feat: create pdf ingest pipeline
Feat: add commands in makefile
2024-06-18 06:30:27 +02:00 · 2024-06-18 06:26:51 +02:00 · 2024-06-11 17:54:25 +02:00
4 changed files with 51 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -159,3 +159,5 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+*.duckdb
--- a/14
+++ b/14
@@ -42,6 +42,18 @@ clean_built:
 	rm -rf $(DATA_BASE)/staging/**/*.csv
 	rm -rf $(DATA_BASE)/gold/**/*.csv
 	rm -rf $(DATA_BASE)/datamart/**/*.csv
+	rm -rf $(DATA_BASE)/datamart/**/*.xlsx
+
+run_ingest:  ## Run the scripts module with the `ingest` argument
+	python -m scripts ingest
+
+run_feature:  ## Run the scripts module with the `feature` argument
+	python -m scripts feature
+
+run_datamart:  ## Run the scripts module with the `datamart` argument
+	python -m scripts datamart
+
+build: clean_built run_ingest run_feature run_datamart  ## Clean, then ingest, feature, datamart; NOTE(review): this order is only guaranteed in a serial run (no -j)

 clean_all: clean_built clean_raw

@@ -50,3 +62,5 @@ import_nextcloud:

 push_nextcloud:
 	rsync -av ./datas/datamart/ ~/Nextcloud/PLESNA\ Compta\ SYSTEM/DataMart
+
+
--- a/dlt/pdf_pipeline.py
+++ b/dlt/pdf_pipeline.py
@@ -0,0 +1,33 @@
+import dlt
+from pathlib import Path
+from pdf_oralia.extract import from_pdf
+import pdfplumber
+
+DATA_PATH = Path("datas/")  # relative path: resolved against the current working directory
+assert DATA_PATH.exists()  # stops the module at load time if the data folder is missing
+RAW_CRG_PDF = DATA_PATH / "pdfs"  # folder globbed recursively for *.pdf by the __main__ block
+assert RAW_CRG_PDF.exists()  # NOTE(review): assert statements are skipped under `python -O`
+
+
+@dlt.resource(name="crg")
+def crg_pdf(filename):
+    """Yield each charge row of the PDF `filename` as a dict; yield nothing on ValueError."""
+    print(filename)
+    # `with` closes the PDF when the generator ends; the handle used to leak.
+    with pdfplumber.open(filename) as pdf:
+        try:
+            df_charge, _df_loc = from_pdf(pdf)
+        except ValueError as e:
+            print(f"\tExtract Error: {e}")
+        else:
+            yield from df_charge.to_dict("records")
+
+
+if __name__ == "__main__":
+    # Single pipeline named "raw" writing to the "crg" dataset in DuckDB.
+    raw_pipeline = dlt.pipeline(
+        pipeline_name="raw", destination="duckdb", dataset_name="crg"
+    )
+    # Every PDF found recursively under RAW_CRG_PDF is loaded by its own run.
+    for pdf_path in RAW_CRG_PDF.glob("**/*.pdf"):
+        print(raw_pipeline.run(crg_pdf(pdf_path), table_name="charge"))
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,5 @@ pandas==1.5.0
 pdf-oralia==0.3.11
 pydantic==2.6.1
 click==8.1.7
+dlt[duckdb]>=0.4.3a0
+openpyxl>=3.0.0
Author	SHA1	Message	Date
Bertrand Benjamin	4249e902b2	Feat: add *.duckdb in gitignore	2024-06-18 06:30:27 +02:00
Bertrand Benjamin	ab36931c06	Feat: create pdf ingest pipeline	2024-06-18 06:26:51 +02:00
Bertrand Benjamin	1ed6ed43ed	Feat: add commands in makefile	2024-06-11 17:54:25 +02:00