4 changed files with 0 additions and 51 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -159,5 +159,3 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 *.duckdb
--- a/14
+++ b/14
@@ -42,18 +42,6 @@ clean_built:
 	rm -rf $(DATA_BASE)/staging/**/*.csv
 	rm -rf $(DATA_BASE)/gold/**/*.csv
 	rm -rf $(DATA_BASE)/datamart/**/*.csv
 	rm -rf $(DATA_BASE)/datamart/**/*.xlsx
 # These targets are commands, not files. Declaring them phony keeps a stray
 # file or directory of the same name (e.g. `build/`) from masking them.
 .PHONY: run_ingest run_feature run_datamart build clean_all
 # `build` needs its prerequisites to run in the listed order (clean first,
 # then the stages one after another), which `make -j` does not guarantee.
 # GNU Make >= 4.4 serialises only the prerequisites of `build`; older
 # versions ignore the prerequisite and serialise the whole makefile.
 .NOTPARALLEL: build
 # Run the `ingest` sub-command of the project's `scripts` package.
 run_ingest:
 	python -m scripts ingest
 # Run the `feature` sub-command of the project's `scripts` package.
 run_feature:
 	python -m scripts feature
 # Run the `datamart` sub-command of the project's `scripts` package.
 run_datamart:
 	python -m scripts datamart
 # Full rebuild: drop previously built outputs, then run every stage.
 build: clean_built run_ingest run_feature run_datamart
 # Remove both the built outputs and the raw inputs.
 clean_all: clean_built clean_raw
@@ -62,5 +50,3 @@ import_nextcloud:
 push_nextcloud:
 	rsync -av ./datas/datamart/ ~/Nextcloud/PLESNA\ Compta\ SYSTEM/DataMart
--- a/dlt/pdf_pipeline.py
+++ b/dlt/pdf_pipeline.py
@@ -1,33 +0,0 @@
 import dlt
 from pathlib import Path
 from pdf_oralia.extract import from_pdf
 import pdfplumber
 # Data directory, given as a path relative to the current working directory.
 DATA_PATH = Path("datas/")
 # Sub-directory that is searched recursively for the PDFs to load.
 RAW_CRG_PDF = DATA_PATH.joinpath("pdfs")
 # Stop at import time when either directory is missing.
 # NOTE: `assert` statements are skipped when Python runs with -O.
 assert DATA_PATH.exists()
 assert RAW_CRG_PDF.exists()
@dlt.resource(name="crg")
def crg_pdf(filename):
    """Yield the charge rows extracted from one PDF as dictionaries.

    Args:
        filename: Path of the PDF to open with ``pdfplumber``.

    Yields:
        One dict per record of the first table returned by ``from_pdf``
        (``df_charge``), as produced by ``to_dict("records")``.

    A ``ValueError`` raised by ``from_pdf`` is reported on stdout and the
    file is skipped; nothing is yielded for it.
    """
    print(filename)
    # Open the PDF in a context manager so the handle is released even when
    # extraction fails; the original never closed it.
    with pdfplumber.open(filename) as pdf:
        try:
            # The second table is not used by this resource.
            # NOTE(review): assumes from_pdf returns fully materialised
            # tables that remain usable once the PDF is closed - confirm.
            df_charge, _df_loc = from_pdf(pdf)
        except ValueError as e:
            print(f"\tExtract Error: {e}")
            return
    yield from df_charge.to_dict("records")
 if __name__ == "__main__":
    # One pipeline named "raw", writing to DuckDB under the "crg" dataset.
    raw_pipeline = dlt.pipeline(
        pipeline_name="raw",
        destination="duckdb",
        dataset_name="crg",
    )
    # Run the pipeline once per PDF found under RAW_CRG_PDF, loading the
    # extracted rows into the "charge" table and printing each load report.
    for pdf_path in RAW_CRG_PDF.glob("**/*.pdf"):
        print(raw_pipeline.run(crg_pdf(pdf_path), table_name="charge"))
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,5 +3,3 @@ pandas==1.5.0
 pdf-oralia==0.3.11
 pydantic==2.6.1
 click==8.1.7
 dlt[duckdb]>=0.4.3a0
 openpyxl>=3.0.0