def extract_date(page_text):
    """Extract the date that follows the "Lyon le" marker in a page.

    The first line containing "Lyon le" is used, and its last
    whitespace-separated token is parsed as a ``dd/mm/YYYY`` date.

    :param page_text: text in the page
    :return: the extracted date as a ``datetime``, or ``None`` when no
        line of the page contains "Lyon le"
    :raises ValueError: if the last token of the marker line is not a
        ``dd/mm/YYYY`` date
    """
    for line in page_text.splitlines():
        if "Lyon le" not in line:
            continue
        # Split on any run of whitespace so that trailing spaces, tabs or a
        # stray carriage return do not end up glued to (or replace) the date.
        words = line.split()
        return datetime.strptime(words[-1], "%d/%m/%Y")
    # No marker on this page: callers must handle the missing date.
    return None