Compare commits

..

No commits in common. "dbt-dlt" and "main" have entirely different histories.

2 changed files with 0 additions and 35 deletions

2
.gitignore vendored
View File

@ -159,5 +159,3 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
*.duckdb

View File

@ -1,33 +0,0 @@
import dlt
from pathlib import Path
from pdf_oralia.extract import from_pdf
import pdfplumber
# Root directory holding the pipeline's input data, resolved relative to the
# current working directory.
DATA_PATH = Path("datas/")
# Fail fast with an explicit exception instead of `assert`: assertions are
# stripped under `python -O`, which would let the script run against a
# missing directory and silently load nothing.
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Data directory not found: {DATA_PATH}")
# Sub-directory that contains the PDF files to ingest.
RAW_CRG_PDF = DATA_PATH / "pdfs"
if not RAW_CRG_PDF.exists():
    raise FileNotFoundError(f"PDF directory not found: {RAW_CRG_PDF}")
@dlt.resource(name="crg")
def crg_pdf(filename):
    """Yield the charge rows extracted from one PDF file.

    The file is opened with ``pdfplumber`` and handed to
    ``pdf_oralia.extract.from_pdf``, which returns two tables. Only the first
    one (``df_charge``) is emitted, one ``dict`` per record; the second table
    is ignored.

    Args:
        filename: Path of the PDF file to parse.

    Yields:
        One record of the charge table per iteration. Nothing is yielded
        when extraction raises ``ValueError``: the error is printed and the
        file is skipped.
    """
    print(filename)
    # Use the PDF as a context manager so the underlying file handle is
    # released even when extraction fails or the consumer stops early.
    with pdfplumber.open(filename) as pdf:
        try:
            df_charge, _df_loc = from_pdf(pdf)
        except ValueError as e:
            # Best effort: report the unparsable file and move on.
            print(f"\tExtract Error: {e}")
            return
        # Materialise the records while the PDF is still open, so the file
        # can be closed before any row is handed to the consumer.
        rows = df_charge.to_dict("records")
    yield from rows
if __name__ == "__main__":
    # Script entry point: load every PDF found under RAW_CRG_PDF into the
    # "charge" table of the "crg" dataset in a DuckDB destination.
    raw_pipeline = dlt.pipeline(
        pipeline_name="raw",
        destination="duckdb",
        dataset_name="crg",
    )
    # One pipeline run per PDF; the load summary of each run is printed.
    # NOTE(review): the print is assumed to sit inside the loop (the original
    # indentation was lost) - confirm.
    for pdf_path in RAW_CRG_PDF.glob("**/*.pdf"):
        charge_rows = crg_pdf(pdf_path)
        run_summary = raw_pipeline.run(charge_rows, table_name="charge")
        print(run_summary)