Compare commits
No commits in common. "4249e902b2d7da74b4aa2868121a8a01d510f62b" and "215e26b84fcf270d2c47c121cdb41ba84f0ba232" have entirely different histories.
4249e902b2
...
215e26b84f
2
.gitignore
vendored
2
.gitignore
vendored
@ -159,5 +159,3 @@ cython_debug/
|
|||||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
#.idea/
|
#.idea/
|
||||||
|
|
||||||
*.duckdb
|
|
||||||
|
14
Makefile
14
Makefile
@ -42,18 +42,6 @@ clean_built:
|
|||||||
rm -rf $(DATA_BASE)/staging/**/*.csv
|
rm -rf $(DATA_BASE)/staging/**/*.csv
|
||||||
rm -rf $(DATA_BASE)/gold/**/*.csv
|
rm -rf $(DATA_BASE)/gold/**/*.csv
|
||||||
rm -rf $(DATA_BASE)/datamart/**/*.csv
|
rm -rf $(DATA_BASE)/datamart/**/*.csv
|
||||||
rm -rf $(DATA_BASE)/datamart/**/*.xlsx
|
|
||||||
|
|
||||||
run_ingest:
|
|
||||||
python -m scripts ingest
|
|
||||||
|
|
||||||
run_feature:
|
|
||||||
python -m scripts feature
|
|
||||||
|
|
||||||
run_datamart:
|
|
||||||
python -m scripts datamart
|
|
||||||
|
|
||||||
build: clean_built run_ingest run_feature run_datamart
|
|
||||||
|
|
||||||
clean_all: clean_built clean_raw
|
clean_all: clean_built clean_raw
|
||||||
|
|
||||||
@ -62,5 +50,3 @@ import_nextcloud:
|
|||||||
|
|
||||||
push_nextcloud:
|
push_nextcloud:
|
||||||
rsync -av ./datas/datamart/ ~/Nextcloud/PLESNA\ Compta\ SYSTEM/DataMart
|
rsync -av ./datas/datamart/ ~/Nextcloud/PLESNA\ Compta\ SYSTEM/DataMart
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,33 +0,0 @@
|
|||||||
import dlt
|
|
||||||
from pathlib import Path
|
|
||||||
from pdf_oralia.extract import from_pdf
|
|
||||||
import pdfplumber
|
|
||||||
|
|
||||||
DATA_PATH = Path("datas/")
|
|
||||||
assert DATA_PATH.exists()
|
|
||||||
RAW_CRG_PDF = DATA_PATH / "pdfs"
|
|
||||||
assert RAW_CRG_PDF.exists()
|
|
||||||
|
|
||||||
|
|
||||||
@dlt.resource(name="crg")
|
|
||||||
def crg_pdf(filename):
|
|
||||||
print(filename)
|
|
||||||
pdf = pdfplumber.open(filename)
|
|
||||||
try:
|
|
||||||
df_charge, df_loc = from_pdf(pdf)
|
|
||||||
except ValueError as e:
|
|
||||||
print(f"\tExtract Error: {e}")
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
for row in df_charge.to_dict("records"):
|
|
||||||
yield row
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
pipeline = dlt.pipeline(
|
|
||||||
pipeline_name='raw', destination="duckdb", dataset_name="crg"
|
|
||||||
)
|
|
||||||
|
|
||||||
for pdf_file in RAW_CRG_PDF.glob("**/*.pdf"):
|
|
||||||
load_info = pipeline.run(crg_pdf(pdf_file), table_name='charge')
|
|
||||||
print(load_info)
|
|
@ -3,5 +3,3 @@ pandas==1.5.0
|
|||||||
pdf-oralia==0.3.11
|
pdf-oralia==0.3.11
|
||||||
pydantic==2.6.1
|
pydantic==2.6.1
|
||||||
click==8.1.7
|
click==8.1.7
|
||||||
dlt[duckdb]>=0.4.3a0
|
|
||||||
openpyxl>=3.0.0
|
|
||||||
|
Loading…
Reference in New Issue
Block a user