From b0333cddd8481d8f3742935003fd884dc78e3443 Mon Sep 17 00:00:00 2001 From: Bertrand Benjamin Date: Wed, 20 Sep 2023 09:22:50 +0200 Subject: [PATCH 1/2] fix: raise a warning when a page is not recognized --- pdf_oralia/extract.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pdf_oralia/extract.py b/pdf_oralia/extract.py index 3407f56..f7e2dba 100644 --- a/pdf_oralia/extract.py +++ b/pdf_oralia/extract.py @@ -45,7 +45,7 @@ def from_pdf(pdf): charge_tables = [] patrimoie_tables = [] - for page in pdf.pages: + for page_number, page in enumerate(pdf.pages): page_text = page.extract_text() date = extract_date(page_text) additionnal_fields = { @@ -76,7 +76,7 @@ def from_pdf(pdf): pass else: - raise ValueError("Page non reconnu") + logging.warning(f"Page {page_number+1} non reconnu. Page ignorée.") df_charge = charge.table2df(recapitulatif_tables + charge_tables) df_loc = locataire.table2df(loc_tables) -- 2.45.2 From 0040dccd9a902d70277019433648b8b247f92e31 Mon Sep 17 00:00:00 2001 From: Bertrand Benjamin Date: Wed, 20 Sep 2023 09:28:57 +0200 Subject: [PATCH 2/2] Feat: Handle get_lot when RECAPITULATIF is nan --- pdf_oralia/pages/charge.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pdf_oralia/pages/charge.py b/pdf_oralia/pages/charge.py index 190e5ff..4ebb6bd 100644 --- a/pdf_oralia/pages/charge.py +++ b/pdf_oralia/pages/charge.py @@ -32,7 +32,10 @@ def is_it(page_text): def get_lot(txt): """Return lot number from "RECAPITULATIF DES OPERATIONS" """ regex = r"[BSM](\d+)(?=\s*-)" - result = re.findall(regex, txt) + try: + result = re.findall(regex, txt) + except TypeError: + return "*" if result: return "{:02d}".format(int(result[0])) return "*" -- 2.45.2