Feat: extract date from top of page
This commit is contained in:
parent
e64fb63129
commit
bde6a0dfc6
@ -1,4 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@ -13,6 +14,19 @@ charge_table_settings = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def extract_date(page_text):
    """Extract the date that follows "Lyon le" in a page's text.

    The text is scanned line by line. For each line containing "Lyon le",
    the last whitespace-delimited token is parsed as a ``dd/mm/YYYY`` date.
    Lines whose last token is not a valid date are skipped so that a later
    matching line can still provide the date.

    :param page_text: text in the page
    :return: the extracted date as a ``datetime``, or ``None`` when no line
        containing "Lyon le" ends with a valid date
    """
    for line in page_text.split("\n"):
        if "Lyon le" not in line:
            continue
        # split() with no argument ignores trailing/repeated whitespace and a
        # stray "\r"; split(" ") would produce an empty or polluted last token
        # that strptime rejects.
        words = line.split()
        try:
            return datetime.strptime(words[-1], "%d/%m/%Y")
        except ValueError:
            # This line mentions "Lyon le" but does not end with a date;
            # keep looking further down the page.
            continue
    return None
|
||||||
|
|
||||||
|
|
||||||
def extract_from_pdf(pdf, charge_dest, location_dest):
|
def extract_from_pdf(pdf, charge_dest, location_dest):
|
||||||
"""Build charge_dest and location_dest xlsx file from pdf"""
|
"""Build charge_dest and location_dest xlsx file from pdf"""
|
||||||
loc_tables = []
|
loc_tables = []
|
||||||
@ -27,8 +41,11 @@ def extract_from_pdf(pdf, charge_dest, location_dest):
|
|||||||
situation_loc_line = [
|
situation_loc_line = [
|
||||||
l for l in page_text.split("\n") if "SITUATION DES LOCATAIRES" in l
|
l for l in page_text.split("\n") if "SITUATION DES LOCATAIRES" in l
|
||||||
]
|
]
|
||||||
|
date = extract_date(page_text)
|
||||||
|
mois = date.strftime("%m")
|
||||||
|
annee = date.strftime("%Y")
|
||||||
if situation_loc_line:
|
if situation_loc_line:
|
||||||
mois, annee = situation_loc_line[0].split(" ")[-2:]
|
# mois, annee = situation_loc_line[0].split(" ")[-2:]
|
||||||
if loc_tables:
|
if loc_tables:
|
||||||
loc_tables.append(page.extract_table()[1:])
|
loc_tables.append(page.extract_table()[1:])
|
||||||
else:
|
else:
|
||||||
|
Loading…
Reference in New Issue
Block a user