Feat: extract date from top of page

This commit is contained in:
Bertrand Benjamin 2022-12-18 10:01:19 +01:00
parent e64fb63129
commit bde6a0dfc6
1 changed files with 18 additions and 1 deletions

View File

@ -1,4 +1,5 @@
import logging
from datetime import datetime
from pathlib import Path
import pandas as pd
@ -13,6 +14,19 @@ charge_table_settings = {
}
def extract_date(page_text):
    """Extract the date from the first "Lyon le" line of a page.

    The page text is scanned line by line; on the first line containing
    the marker "Lyon le", the last whitespace-separated token is parsed
    as a ``dd/mm/YYYY`` date.

    :param page_text: text in the page
    :return: the extracted date as a ``datetime``, or ``None`` when no
        line contains "Lyon le" (callers must handle that case)
    :raises ValueError: if the last token of the matching line is not a
        ``dd/mm/YYYY`` date
    """
    for line in page_text.split("\n"):
        if "Lyon le" not in line:
            continue
        # Split on any whitespace run (not a single space) so trailing
        # spaces or a stray "\r" do not leave an empty/dirty last token.
        tokens = line.split()
        return datetime.strptime(tokens[-1], "%d/%m/%Y")
    # No marker line found: make the "no date" outcome explicit.
    return None
def extract_from_pdf(pdf, charge_dest, location_dest):
"""Build charge_dest and location_dest xlsx file from pdf"""
loc_tables = []
@ -27,8 +41,11 @@ def extract_from_pdf(pdf, charge_dest, location_dest):
situation_loc_line = [
l for l in page_text.split("\n") if "SITUATION DES LOCATAIRES" in l
]
date = extract_date(page_text)
mois = date.strftime("%m")
annee = date.strftime("%Y")
if situation_loc_line:
mois, annee = situation_loc_line[0].split(" ")[-2:]
# mois, annee = situation_loc_line[0].split(" ")[-2:]
if loc_tables:
loc_tables.append(page.extract_table()[1:])
else: