"""Load charge rows extracted from CRG PDF files into a DuckDB dataset via dlt."""

from pathlib import Path

import dlt
import pdfplumber
from pdf_oralia.extract import from_pdf

# Root data directory, relative to the current working directory.
DATA_PATH = Path("datas/")
assert DATA_PATH.exists()

# Directory that is searched recursively for ``*.pdf`` files.
RAW_CRG_PDF = DATA_PATH / "pdfs"
assert RAW_CRG_PDF.exists()


@dlt.resource(name="crg")
def crg_pdf(filename):
    """Yield one dict per charge row extracted from a single PDF.

    The file name is printed before extraction. If ``from_pdf`` raises
    ``ValueError`` the error is printed and the resource yields nothing for
    that file, so one unreadable PDF does not abort the whole run.

    Args:
        filename: Path of the PDF file to open with ``pdfplumber``.

    Yields:
        The records of the first object returned by ``from_pdf``, as produced
        by its ``to_dict("records")`` (presumably a pandas DataFrame --
        confirm against ``pdf_oralia``).
    """
    print(filename)
    # The context manager closes the PDF once the generator finishes or is
    # closed; previously the handle was never released.
    with pdfplumber.open(filename) as pdf:
        try:
            # The second returned value is not used by this resource.
            df_charge, _df_loc = from_pdf(pdf)
        except ValueError as e:
            print(f"\tExtract Error: {e}")
        else:
            yield from df_charge.to_dict("records")


if __name__ == "__main__":
    pipeline = dlt.pipeline(
        pipeline_name='raw',
        destination="duckdb",
        dataset_name="crg",
    )

    # One pipeline run per PDF; every run loads into the same 'charge' table.
    for pdf_file in RAW_CRG_PDF.glob("**/*.pdf"):
        load_info = pipeline.run(crg_pdf(pdf_file), table_name='charge')
        print(load_info)