Feat: create pdf ingest pipeline

2024-06-18 06:26:51 +02:00
parent 1ed6ed43ed
commit ab36931c06
1 changed files with 33 additions and 0 deletions
--- a/dlt/pdf_pipeline.py
+++ b/dlt/pdf_pipeline.py
@@ -0,0 +1,33 @@
 import dlt
 from pathlib import Path
 from pdf_oralia.extract import from_pdf
 import pdfplumber
 # Root folder for the pipeline's input data; a relative path, so it is
 # resolved against the current working directory.
 DATA_PATH = Path("datas/")
 # Fail fast at import time when the expected layout is missing. Explicit
 # raises are used instead of `assert` because assertions are stripped when
 # Python runs with -O, which would silently skip these checks.
 if not DATA_PATH.exists():
     raise FileNotFoundError(f"Data directory not found: {DATA_PATH}")
 # Folder that is searched recursively for the PDF files to ingest.
 RAW_CRG_PDF = DATA_PATH / "pdfs"
 if not RAW_CRG_PDF.exists():
     raise FileNotFoundError(f"PDF directory not found: {RAW_CRG_PDF}")
@dlt.resource(name="crg")
def crg_pdf(filename):
    """Yield the rows extracted from one PDF as dictionaries.

    The PDF is opened with ``pdfplumber`` and handed to ``from_pdf``, which
    returns two tables; only the first one (``df_charge``) is emitted, the
    second one is ignored.

    Args:
        filename: Path of the PDF file, passed as-is to ``pdfplumber.open``.

    Yields:
        One dict per record of ``df_charge`` (``to_dict("records")``).
        Nothing is yielded when extraction raises ``ValueError``; the error
        is printed and the file is skipped.
    """
    print(filename)
    # The context manager guarantees the PDF handle is released even when
    # extraction fails; the original never closed it.
    with pdfplumber.open(filename) as pdf:
        try:
            df_charge, _ = from_pdf(pdf)
        except ValueError as e:
            # Best-effort ingestion: report the problem and skip this file.
            print(f"\tExtract Error: {e}")
            return
    # NOTE(review): assumes from_pdf returns fully materialized tables
    # (pandas-style DataFrames) that no longer need the open PDF — confirm.
    yield from df_charge.to_dict("records")
 # Script entry point: push every PDF found under RAW_CRG_PDF through the
 # `crg_pdf` resource and load the rows into the `charge` table.
 if __name__ == "__main__":
     # A single pipeline object is reused for all files.
     raw_pipeline = dlt.pipeline(
         pipeline_name="raw",
         destination="duckdb",
         dataset_name="crg",
     )
     # Recursive search; each PDF gets its own pipeline run.
     pdf_paths = RAW_CRG_PDF.glob("**/*.pdf")
     for pdf_path in pdf_paths:
         charge_rows = crg_pdf(pdf_path)
         run_summary = raw_pipeline.run(charge_rows, table_name="charge")
         print(run_summary)