Skip unrecognized pages instead of aborting, and tolerate non-text input in get_lot.

* extract.py (from_pdf): enumerate the pages and replace the ValueError raised
  for an unrecognized page with a logging.warning that reports the 1-based
  page number.
* pages/charge.py (get_lot): return "*" when re.findall raises TypeError.

NOTE(review): indentation inside the hunks below was reconstructed from the
Python block structure; confirm against the target files before applying.

diff --git a/pdf_oralia/extract.py b/pdf_oralia/extract.py
index 3407f56..f7e2dba 100644
--- a/pdf_oralia/extract.py
+++ b/pdf_oralia/extract.py
@@ -45,7 +45,7 @@ def from_pdf(pdf):
     charge_tables = []
     patrimoie_tables = []
 
-    for page in pdf.pages:
+    for page_number, page in enumerate(pdf.pages):
         page_text = page.extract_text()
         date = extract_date(page_text)
         additionnal_fields = {
@@ -76,7 +76,7 @@ def from_pdf(pdf):
             pass
 
         else:
-            raise ValueError("Page non reconnu")
+            logging.warning(f"Page {page_number+1} non reconnu. Page ignorée.")
 
     df_charge = charge.table2df(recapitulatif_tables + charge_tables)
     df_loc = locataire.table2df(loc_tables)
diff --git a/pdf_oralia/pages/charge.py b/pdf_oralia/pages/charge.py
index 190e5ff..4ebb6bd 100644
--- a/pdf_oralia/pages/charge.py
+++ b/pdf_oralia/pages/charge.py
@@ -32,7 +32,10 @@ def is_it(page_text):
 def get_lot(txt):
     """Return lot number from "RECAPITULATIF DES OPERATIONS" """
     regex = r"[BSM](\d+)(?=\s*-)"
-    result = re.findall(regex, txt)
+    try:
+        result = re.findall(regex, txt)
+    except TypeError:
+        return "*"
     if result:
         return "{:02d}".format(int(result[0]))
     return "*"