From ceebfb0a3852a56b065b717344a972d2285330b9 Mon Sep 17 00:00:00 2001
From: Bertrand Benjamin
Date: Tue, 27 Jun 2023 10:23:02 +0200
Subject: [PATCH] Feat: better extraction of lot

---
 pdf_oralia/pages/charge.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/pdf_oralia/pages/charge.py b/pdf_oralia/pages/charge.py
index a6b6abf..d10c4b6 100644
--- a/pdf_oralia/pages/charge.py
+++ b/pdf_oralia/pages/charge.py
@@ -1,3 +1,5 @@
+import re
+
 import numpy as np
 import pandas as pd
 
@@ -13,15 +15,13 @@ def is_it(page_text):
     return False
 
 
-def get_lot(x):
+def get_lot(txt):
     """Return lot number from "RECAPITULATIF DES OPERATIONS" """
-    if x[:2].isdigit():
-        return x[:2]
-    if x[:1].isdigit():
-        return "0" + x[:1]
-    if x[:2] == "PC":
-        return "PC"
-    return ""
+    regex = r"[BSM](\d+)\s-"
+    result = re.findall(regex, txt)
+    if result:
+        return "{:02d}".format(int(result[0]))
+    return "*"
 
 
 def keep_row(row):