from datetime import datetime


def extract_date(page_text):
    """Extract the date that follows "Lyon le" in the text of a page.

    The text is scanned line by line. The first line containing
    ``Lyon le`` is selected and its last whitespace-separated word is
    parsed as a ``dd/mm/YYYY`` date.

    :param page_text: text in the page (``None`` or an empty string is
        accepted and yields ``None``)
    :return: the extracted date, or ``None`` when no line contains
        ``Lyon le``
    :rtype: datetime.datetime or None
    :raises ValueError: if the last word of the selected line is not a
        ``dd/mm/YYYY`` date
    """
    if not page_text:
        # Nothing to scan: behave like a page without a date line.
        return None

    for line in page_text.split("\n"):
        if "Lyon le" not in line:
            continue
        # split() without an argument drops trailing spaces and "\r", which
        # would otherwise leave an empty or polluted last word and make
        # strptime fail on an otherwise valid line.
        return datetime.strptime(line.split()[-1], "%d/%m/%Y")

    # No line mentions "Lyon le": make the "not found" result explicit.
    return None