# Source: dlt/pdf_pipeline.py, as introduced by commit
# ab36931c068f568f3d7327299fe7b83669e22695
# ("Feat: create pdf ingest pipeline", Bertrand Benjamin, 2024-06-18).
"""Ingest rows extracted from PDF files into a DuckDB dataset using dlt.

Run as a script, this walks every ``*.pdf`` under ``datas/pdfs`` (recursively),
extracts rows from each file with ``pdf_oralia.extract.from_pdf`` and loads
them into the ``charge`` table of the ``crg`` dataset.
"""
from pathlib import Path

import dlt
import pdfplumber
from pdf_oralia.extract import from_pdf

# Both directories are resolved relative to the current working directory and
# are checked at import time. NOTE: ``assert`` statements are skipped when
# Python runs with ``-O``; in that case a missing directory simply yields no
# PDF files to process.
DATA_PATH = Path("datas/")
assert DATA_PATH.exists(), f"data directory not found: {DATA_PATH}"
RAW_CRG_PDF = DATA_PATH / "pdfs"
assert RAW_CRG_PDF.exists(), f"PDF directory not found: {RAW_CRG_PDF}"


@dlt.resource(name="crg")
def crg_pdf(filename):
    """Yield one dict per row of the first table ``from_pdf`` extracts.

    Args:
        filename: Path of the PDF file to open with ``pdfplumber``.

    Yields:
        The rows of the first value returned by ``from_pdf``, converted with
        ``.to_dict("records")``.

    A ``ValueError`` raised by ``from_pdf`` is reported on stdout and the
    file is skipped (nothing is yielded); any other exception propagates.
    """
    print(filename)
    # The context manager guarantees the PDF handle is released even when the
    # extraction fails or the consumer stops iterating early.
    with pdfplumber.open(filename) as pdf:
        try:
            # The second returned value is not loaded by this pipeline.
            df_charge, _ = from_pdf(pdf)
        except ValueError as e:
            print(f"\tExtract Error: {e}")
            return
        # Materialise the rows while the PDF is still open so that nothing
        # yielded below can depend on the closed file.
        records = df_charge.to_dict("records")

    yield from records


def main():
    """Load every PDF found under ``RAW_CRG_PDF`` into the ``charge`` table."""
    pipeline = dlt.pipeline(
        pipeline_name="raw", destination="duckdb", dataset_name="crg"
    )

    # One pipeline run per file, so each file's load info is printed
    # separately.
    for pdf_file in RAW_CRG_PDF.glob("**/*.pdf"):
        load_info = pipeline.run(crg_pdf(pdf_file), table_name="charge")
        print(load_info)


if __name__ == "__main__":
    main()