Compare commits

3 Commits

Author SHA1 Message Date
4249e902b2 Feat: add *.duckdb in gitignore 2024-06-18 06:30:27 +02:00
ab36931c06 Feat: create pdf ingest pipeline 2024-06-18 06:26:51 +02:00
1ed6ed43ed Feat: add commands in makefile 2024-06-11 17:54:25 +02:00
4 changed files with 51 additions and 0 deletions

2
.gitignore vendored
View File

@@ -159,3 +159,5 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
*.duckdb

View File

@@ -42,6 +42,18 @@ clean_built:
rm -rf $(DATA_BASE)/staging/**/*.csv
rm -rf $(DATA_BASE)/gold/**/*.csv
rm -rf $(DATA_BASE)/datamart/**/*.csv
rm -rf $(DATA_BASE)/datamart/**/*.xlsx
# Pipeline stages. None of these targets produces a file of the same name, so
# they are declared phony: otherwise a stray file or directory called e.g.
# `build` would make the corresponding command silently do nothing.
.PHONY: run_ingest run_feature run_datamart build clean_all

# Run the `ingest` sub-command of the `scripts` package.
run_ingest:
	python -m scripts ingest

# Run the `feature` sub-command of the `scripts` package.
run_feature:
	python -m scripts feature

# Run the `datamart` sub-command of the `scripts` package.
run_datamart:
	python -m scripts datamart

# Full rebuild: clean the built CSV/XLSX files, then run every stage.
# NOTE(review): the stage order relies on make processing prerequisites left to
# right, which only holds for a serial run -- do not invoke this target with -j.
build: clean_built run_ingest run_feature run_datamart

# Remove built files and raw files (both clean targets are defined elsewhere in
# this Makefile).
clean_all: clean_built clean_raw
@@ -50,3 +62,5 @@ import_nextcloud:
push_nextcloud:
rsync -av ./datas/datamart/ ~/Nextcloud/PLESNA\ Compta\ SYSTEM/DataMart

33
dlt/pdf_pipeline.py Normal file
View File

@@ -0,0 +1,33 @@
import dlt
from pathlib import Path
from pdf_oralia.extract import from_pdf
import pdfplumber
# Root of the data tree, relative to the current working directory: the script
# must therefore be launched from the project root.
DATA_PATH = Path("datas/")
assert DATA_PATH.exists(), f"Data directory not found: {DATA_PATH.resolve()}"

# Folder searched (recursively) for the CRG PDF files to ingest.
RAW_CRG_PDF = DATA_PATH / "pdfs"
assert RAW_CRG_PDF.exists(), f"PDF directory not found: {RAW_CRG_PDF.resolve()}"
@dlt.resource(name="crg")
def crg_pdf(filename):
    """Yield the charge rows extracted from one CRG PDF, one dict per row.

    Args:
        filename: Path of the PDF file to read; it is printed before
            extraction so that any error below can be tied to a file.

    Yields:
        One dict per row of the charge table returned by ``from_pdf``
        (``DataFrame.to_dict("records")`` format). Nothing is yielded when
        ``from_pdf`` raises ``ValueError``: the error is printed and the
        file is skipped so that one bad PDF does not abort a whole batch.
    """
    print(filename)
    # The context manager closes the underlying file handle even when the
    # extraction fails; the bare open() leaked one handle per processed PDF.
    with pdfplumber.open(filename) as pdf:
        try:
            # The second returned table is not loaded by this resource.
            df_charge, _df_loc = from_pdf(pdf)
        except ValueError as e:
            print(f"\tExtract Error: {e}")
            return
        # Materialise the rows while the PDF is still open.
        records = df_charge.to_dict("records")
    yield from records
if __name__ == "__main__":
    # Pipeline named "raw" that loads into DuckDB, under the "crg" dataset.
    pipeline = dlt.pipeline(
        pipeline_name='raw', destination="duckdb", dataset_name="crg"
    )
    # Path.glob() yields files in arbitrary, filesystem-dependent order; sort
    # them so that the load order (and the log output) is reproducible.
    for pdf_file in sorted(RAW_CRG_PDF.glob("**/*.pdf")):
        # One pipeline run per PDF, every run targeting the 'charge' table.
        load_info = pipeline.run(crg_pdf(pdf_file), table_name='charge')
        print(load_info)

View File

@@ -3,3 +3,5 @@ pandas==1.5.0
pdf-oralia==0.3.11
pydantic==2.6.1
click==8.1.7
dlt[duckdb]>=0.4.3a0
openpyxl>=3.0.0