From 4ee78a7e7b0c433fb5209524c8292efb478a0494 Mon Sep 17 00:00:00 2001 From: Bertrand Benjamin Date: Wed, 26 Feb 2025 05:58:38 +0100 Subject: [PATCH] Feat: specify page type before extracting it --- pdf_oralia/extract.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pdf_oralia/extract.py b/pdf_oralia/extract.py index 238fe4d..3ad05ac 100644 --- a/pdf_oralia/extract.py +++ b/pdf_oralia/extract.py @@ -50,26 +50,33 @@ def pdf_extract_tables_lines(pdf): "mois": date.strftime("%m"), "annee": date.strftime("%Y"), } + table_type = "" + if locataire.is_it(page_text): + table_type = "locataire" + elif charge.is_it(page_text): + table_type = "charge" + elif patrimoine.is_it(page_text): + table_type = "patrimoine" + else: + logging.warning(f"Page {page_number} non reconnu. Page ignorée.") for line in page.extract_table(extract_table_settings): - if locataire.is_it(page_text): + if table_type == "locataire": res = loc_sink.send(line) if res: res.update(additionnal_fields) yield locataire.Line(**res) - elif charge.is_it(page_text): + elif table_type == "charge": res = charge_sink.send(line) if res: res.update(additionnal_fields) yield charge.Line(**res) - elif patrimoine.is_it(page_text): + elif table_type == "patrimoine": res = patrimoine_sink.send(line) if res: res.update(additionnal_fields) yield patrimoine.Line(**res) - else: - logging.warning(f"Page {page_number} non reconnu. Page ignorée.") page_number += 1