34 lines
811 B
Python
34 lines
811 B
Python
|
import dlt
|
||
|
from pathlib import Path
|
||
|
from pdf_oralia.extract import from_pdf
|
||
|
import pdfplumber
|
||
|
|
||
|
# Root directory for the pipeline's input data, relative to the current
# working directory. The script refuses to start when it is missing.
# NOTE: `assert` statements are skipped when Python runs with -O.
DATA_PATH = Path("datas/")
assert DATA_PATH.exists()

# Sub-directory of DATA_PATH that is scanned for the PDF files to load.
RAW_CRG_PDF = DATA_PATH.joinpath("pdfs")
assert RAW_CRG_PDF.exists()
|
||
|
|
||
|
|
||
|
@dlt.resource(name="crg")
def crg_pdf(filename):
    """Yield the charge rows extracted from a single PDF file.

    The file is opened with ``pdfplumber`` and handed to ``from_pdf``, which
    returns two objects; only the first one (the charges) is used here.

    Args:
        filename: Path of the PDF to open; passed as-is to
            ``pdfplumber.open``.

    Yields:
        One item per entry of ``df_charge.to_dict("records")``
        (presumably one dict per row of a pandas DataFrame -- confirm
        against ``pdf_oralia.extract.from_pdf``).

    Note:
        A ``ValueError`` raised by ``from_pdf`` is reported on stdout and the
        resource then yields nothing for that file (deliberate best-effort:
        one unreadable PDF must not abort the whole load).
    """
    print(filename)
    # The context manager guarantees the PDF handle is released once the
    # generator is exhausted or closed, instead of leaking one open file per
    # processed PDF.
    with pdfplumber.open(filename) as pdf:
        try:
            # The second returned value is not loaded by this resource.
            df_charge, _df_loc = from_pdf(pdf)
        except ValueError as e:
            print(f"\tExtract Error: {e}")
        else:
            yield from df_charge.to_dict("records")
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # A single pipeline object is reused for every file, so all runs target
    # the same DuckDB destination and the "crg" dataset.
    pipeline = dlt.pipeline(
        pipeline_name="raw",
        destination="duckdb",
        dataset_name="crg",
    )

    # Every PDF found under RAW_CRG_PDF (searched recursively) is loaded in
    # its own run into the "charge" table; the load info is printed per file.
    for pdf_path in RAW_CRG_PDF.glob("**/*.pdf"):
        resource = crg_pdf(pdf_path)
        print(pipeline.run(resource, table_name="charge"))
|