2022-09-28 07:56:35 +00:00
|
|
|
import logging
|
2022-12-18 09:01:19 +00:00
|
|
|
from datetime import datetime
|
2022-09-27 12:48:41 +00:00
|
|
|
from pathlib import Path
|
2022-09-27 13:01:14 +00:00
|
|
|
|
2022-10-10 19:53:12 +00:00
|
|
|
import pandas as pd
|
2022-09-27 13:01:14 +00:00
|
|
|
import pdfplumber
|
2022-09-27 12:48:41 +00:00
|
|
|
|
2022-10-10 19:53:12 +00:00
|
|
|
from .extract_charge import extract_charge, extract_remise_com
|
2022-09-27 14:07:06 +00:00
|
|
|
from .extract_locataire import extract_situation_loc
|
2022-09-27 12:48:41 +00:00
|
|
|
|
|
|
|
# pdfplumber table-extraction settings used for the charge pages:
# columns are delimited by drawn lines, rows are inferred from the text.
charge_table_settings = dict(
    vertical_strategy="lines",
    horizontal_strategy="text",
)
|
|
|
|
|
|
|
|
|
2022-12-18 09:01:19 +00:00
|
|
|
def extract_date(page_text):
    """Extract the date from a page.

    The first line containing "Lyon le" is used, and its last
    whitespace-separated token is parsed as a ``dd/mm/YYYY`` date.

    :param page_text: text in the page
    :return: the extracted date as a ``datetime``, or ``None`` when no line
        contains "Lyon le"
    :raises ValueError: if the last token of the matching line is not a
        ``dd/mm/YYYY`` date
    """
    for line in page_text.split("\n"):
        if "Lyon le" not in line:
            continue
        # split() without argument ignores leading/trailing and repeated
        # whitespace, so a trailing space after the date does not produce
        # an empty last token.
        words = line.split()
        return datetime.strptime(words[-1], "%d/%m/%Y")
    return None
|
|
|
|
|
|
|
|
|
2022-09-27 12:48:41 +00:00
|
|
|
def extract_from_pdf(pdf, charge_dest, location_dest):
    """Build charge_dest and location_dest xlsx file from pdf.

    The first page is handed to ``extract_remise_com``. Every following page
    is routed by its text: pages containing "SITUATION DES LOCATAIRES" feed
    the tenant tables, pages containing "RECAPITULATIF DES OPERATIONS" feed
    the charge table. On continuation pages the first table row is dropped
    (presumably a repeated header -- confirm against real documents).

    :param pdf: opened pdfplumber document (only ``pdf.pages`` is used)
    :param charge_dest: destination of the charge xlsx file
    :param location_dest: destination of the tenant xlsx file
    :raises ValueError: if no page after the first one carries a
        "Lyon le <dd/mm/YYYY>" date (raised after the charge file is saved)
    """
    loc_tables = []
    charge_table = []
    date = None

    df_1st_charge = extract_remise_com(
        pdf.pages[0].extract_table(charge_table_settings)
    )

    for page_number, page in enumerate(pdf.pages[1:], start=2):
        # Guard against a page yielding no text at all.
        page_text = page.extract_text() or ""

        # Remember the most recent date seen; a page without a date must
        # neither crash the extraction nor erase a previously found date.
        page_date = extract_date(page_text)
        if page_date is not None:
            date = page_date

        if "SITUATION DES LOCATAIRES" in page_text:
            table = page.extract_table()
            if table is None:
                # extract_table() returns None when no table is detected.
                logging.warning("no tenant table found on page %d", page_number)
            else:
                loc_tables.append(table[1:] if loc_tables else table)
        elif "RECAPITULATIF DES OPERATIONS" in page_text:
            table = page.extract_table(charge_table_settings)
            if table is None:
                logging.warning("no charge table found on page %d", page_number)
            else:
                charge_table += table[1:] if charge_table else table

    df_charge = extract_charge(charge_table)
    df_charge_with_1st = pd.concat([df_1st_charge, df_charge])
    df_charge_with_1st.to_excel(charge_dest, sheet_name="Charges", index=False)
    logging.info("%s saved", charge_dest)

    if date is None:
        raise ValueError("no 'Lyon le <dd/mm/YYYY>' date found in the pdf")

    df_loc = extract_situation_loc(
        loc_tables,
        mois=date.strftime("%m"),
        annee=date.strftime("%Y"),
    )
    df_loc.to_excel(location_dest, sheet_name="Location", index=False)
    logging.info("%s saved", location_dest)
|
2022-09-27 12:48:41 +00:00
|
|
|
|
|
|
|
|
2022-09-27 14:01:09 +00:00
|
|
|
def extract_save(pdf_file, dest):
    """Extract charge and locataire for pdf_file and put xlsx file in dest.

    The output files are named after the PDF stem, with spaces replaced by
    underscores, suffixed with ``_charge.xlsx`` and ``_locataire.xlsx``.

    :param pdf_file: path of the PDF to read
    :param dest: directory receiving the two xlsx files
    """
    pdf_file = Path(pdf_file)
    dest_dir = Path(dest)
    stem = pdf_file.stem.replace(" ", "_")

    xls_charge = dest_dir / f"{stem}_charge.xlsx"
    xls_locataire = dest_dir / f"{stem}_locataire.xlsx"

    # Context manager so the PDF file handle is closed even if the
    # extraction raises.
    with pdfplumber.open(pdf_file) as pdf:
        extract_from_pdf(pdf, xls_charge, xls_locataire)
|