44 Commits

Author SHA1 Message Date
092b925b68 doc: add utilisation in readme 2025-02-26 09:08:17 +01:00
3c18bd5d81 Feat: script command all works recursively 2025-02-26 09:02:33 +01:00
4ee78a7e7b Feat: specify page type before extracting it 2025-02-26 05:58:38 +01:00
ce8cdc4c1e Feat: use fsm to extract lines from pdf 2025-02-26 05:54:44 +01:00
6e0ffe9085 core: change pandas version 2024-10-16 06:47:55 +02:00
ab2fdb0541 Feat: make from_pdf importable and move plumber in it 2024-10-16 06:47:25 +02:00
0fc39ed317 Merge pull request 'Update dependency MarkupSafe to v2.1.5' (#17) from renovate/markupsafe-2.x into main
Reviewed-on: #17
2024-02-17 05:08:19 +00:00
a6d6681756 Merge branch 'main' into renovate/markupsafe-2.x 2024-02-17 05:08:08 +00:00
4eecb3a44c Merge pull request 'Update dependency Jinja2 to v3.1.3' (#16) from renovate/jinja2-3.x into main
Reviewed-on: #16
2024-02-17 05:07:48 +00:00
60da623323 Update dependency MarkupSafe to v2.1.5 2024-02-17 05:04:52 +00:00
1f1e3e2741 Update dependency Jinja2 to v3.1.3 2024-02-17 05:04:48 +00:00
2b3e935f39 Merge pull request 'Update dependency Send2Trash to v1.8.2' (#15) from renovate/send2trash-1.x into main
Reviewed-on: #15
2024-02-17 04:56:16 +00:00
ef63f22d44 Merge pull request 'Update dependency MarkupSafe to v2.1.3' (#14) from renovate/markupsafe-2.x into main
Reviewed-on: #14
2024-02-17 04:55:55 +00:00
1020ef9257 Update dependency Send2Trash to v1.8.2 2024-01-10 11:04:32 +00:00
39084ceebd Update dependency MarkupSafe to v2.1.3 2024-01-10 11:04:30 +00:00
7de6c8dd9c clean renovate.json 2024-01-10 10:46:45 +00:00
da3815eea6 activate renovate 2024-01-09 06:53:09 +00:00
45d343d810 Feat: add raise error when src does not exists 2024-01-02 22:22:58 +01:00
806227f202 Feat: add logging in join 2023-12-30 17:45:15 +01:00
7bf0c38883 Feat: add option for debugging 2023-12-30 17:25:40 +01:00
b15b059e2a Add debug 2023-12-27 19:58:12 +01:00
48e75358ac Fix: remove index in excel outputs 2023-10-05 15:22:14 +02:00
132e37267b Feat: logging and option about overwritting 2023-10-05 15:19:16 +02:00
f2bcf6241a Fix: rebuild join_excel 2023-10-05 15:10:39 +02:00
ec9cc19be5 fix: remove when 2023-09-20 09:37:50 +02:00
0040dccd9a Feat: Handle get_lot when RECAPITULATIF is nan 2023-09-20 09:28:57 +02:00
b0333cddd8 fix: raise a warning when a page is not recognized 2023-09-20 09:27:40 +02:00
406b89fea1 Feat: publish tag on Matrix 2023-07-08 09:08:09 +02:00
812d392720 feat: publish to matrix
All checks were successful
continuous-integration/drone/push Build is passing
2023-07-08 09:06:25 +02:00
6b77980e6c Fix 7: change the default FOURNISSEUR 2023-07-07 21:26:00 +02:00
90c2d3689b Fix I4: drop row with "" on locataire ans Période 2023-07-05 18:13:41 +02:00
f9be31c090 Fix #3: replace empty string with np.nan 2023-07-05 17:49:25 +02:00
2761c3ed7b Feat: improve version name for drone 2023-06-30 13:51:04 +02:00
5692898137 Feat: lot s'adapte meme sans espace avant le tiret 2023-06-28 10:49:36 +02:00
44d4150910 Feat: remove Appartement in type 2023-06-28 10:44:56 +02:00
223f25130d Feat: type df columns 2023-06-28 10:30:40 +02:00
1a86b7bc26 Fix: remove useless import 2023-06-28 09:45:18 +02:00
c56241fe4c Doc: init README 2023-06-27 12:06:56 +02:00
ceebfb0a38 Feat: better extraction of lot 2023-06-27 12:06:56 +02:00
18c8282f63 Feat: format lot in locataire table 2023-06-27 12:06:56 +02:00
020fd41eab Feat: add tabulate for dev 2023-06-27 12:06:56 +02:00
8a55e6e2cc Feat: marche avec les pdfs tous ensembles 2023-06-27 12:06:56 +02:00
1afb2a32ab Fix: remove gitea actions 2023-06-27 12:04:42 +02:00
e1332e5e4e Fix: yaml syntax
Some checks failed
Gitea Actions Demo / Hello-Gitea-Actions (push) Failing after 11s
Publish to pipy / Build and publish Python 🐍 distributions 📦 to PyPI (push) Failing after 23s
2023-06-27 11:59:40 +02:00
20 changed files with 3985 additions and 2835 deletions

View File

@@ -11,8 +11,8 @@ steps:
- name: build-and-publish
image: python:3.11
commands:
- echo "Tag: ${DRONE_TAG}"
- sed -i "s/VERSION_PLACEHOLDER/${DRONE_TAG}/g" pyproject.toml
- echo ${DRONE_TAG}
- sed -i 's/version = "[^"]*"/version = "${DRONE_TAG}"/g' pyproject.toml
- curl -sSL https://install.python-poetry.org | python3 -
- export PATH="/root/.local/bin:$PATH"
- poetry --version
@@ -22,10 +22,18 @@ steps:
PYPI_TOKEN:
from_secret: pypi_token
when:
event:
include:
- tag
- name: Notify on matrix
image: plugins/matrix
environment:
MATRIX_ROOMID:
from_secret: MATRIX_ROOMID
MATRIX_ACCESSTOKEN:
from_secret: MATRIX_ACCESSTOKEN
MATRIX_USERID:
from_secret: MATRIX_USERID
settings:
homeserver: https://matrix.poneyworld.net
template: "Une nouvelle version (${DRONE_TAG}) de pdf-oralia est publiée!"
# Déclencheur de la pipeline
trigger:

View File

@@ -1,19 +0,0 @@
name: Gitea Actions Demo
run-name: ${{ gitea.actor }} is testing out Gitea Actions 🚀
on: [push]
jobs:
Hello-Gitea-Actions:
runs-on: ubuntu-latest
steps:
- name: Check out repository code
uses: actions/checkout@v3
- name: Set up Python 3.10
uses: https://github.com/actions/setup-python@v4
with:
python-version: '3.10.12'
- name: Run image
uses: https://github.com/abatilo/actions-poetry@v2
with:
poetry-version: 1.5
- name: View poetry --help
run: poetry --help

View File

@@ -1,31 +0,0 @@
name: Publish to pipy
run-name: ${{ gitea.repository }} publish 🚀 to pipy
on:
push:
tags:
- '*'
jobs:
build-n-publish:
name: Build and publish Python 🐍 distributions 📦 to PyPI
runs-on: ubuntu-latest
steps:
- name: Check out repository code
uses: actions/checkout@v3
- name: Set up Python 3.10
uses: https://github.com/actions/setup-python@v4
with:
python-version: '3.11'
- name: Run image
uses: https://github.com/abatilo/actions-poetry@v2
with:
poetry-version: 1.5
- name: Extract tag name
id: tag
run: echo ::set-output name=TAG_NAME::$(echo $GITHUB_REF | cut -d / -f 3)
- name: Update version in setup.py
run: >-
sed -i "s/{{VERSION_PLACEHOLDER}}/${{ steps.tag.outputs.TAG_NAME }}/g" pyproject.toml
- name: View poetry --help
run: poetry --help

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,23 @@
# PDF ORALIA
Extraction de fichiers de comptabilité en pdf vers xlsx.
## Utilisation
- Lancement sur un fichier pdf particulier
```bash
pdf_oralia extract on <pdf_file> --dest <where to put produced files>
```
- Lancement sur tous les fichiers d'un répertoire (récursivement)
```bash
pdf_oralia extract all --src <source folder> --dest <destination folder>
```
Cette commande reproduira la structure du dossier source dans destination. Seuls les fichiers non existants seront traités. Par défaut, les fichiers déjà produits ne seront pas écrasés.
On peut ajouter les options suivantes:
- `--force`: pour écraser les fichiers déjà traités
- `--only-plan`: pour voir quels fichiers pourraient être créés sans le faire.

View File

@@ -0,0 +1 @@
from .extract import from_pdf

View File

@@ -1,14 +1,13 @@
import logging
from datetime import datetime
from pathlib import Path
import pandas as pd
import pdfplumber
from .extract_charge import extract_charge, extract_remise_com
from .extract_locataire import extract_situation_loc
from pdf_oralia.pages import charge, locataire, patrimoine
charge_table_settings = {
extract_table_settings = {
"vertical_strategy": "lines",
"horizontal_strategy": "text",
}
@@ -27,52 +26,109 @@ def extract_date(page_text):
return datetime.strptime(words[-1], "%d/%m/%Y")
def extract_from_pdf(pdf, charge_dest, location_dest):
"""Build charge_dest and location_dest xlsx file from pdf"""
loc_tables = []
charge_table = []
def extract_building(page_text, buildings=("bloch", "marietton", "servient")):
    """Return the building name found in a page's text.

    Args:
        page_text: raw text of a pdf page (searched case-insensitively).
        buildings: candidate building names, lowercase; first match wins.

    Returns:
        The first candidate contained in the page text.

    Raises:
        ValueError: when no candidate appears in the text.
    """
    # Lowercase once instead of once per candidate; default is a tuple to
    # avoid the shared-mutable-default pitfall.
    lowered = page_text.lower()
    for building in buildings:
        if building in lowered:
            return building
    raise ValueError("Pas d'immeuble trouvé")
df_1st_charge = extract_remise_com(
pdf.pages[0].extract_table(charge_table_settings)
)
for page in pdf.pages[1:]:
def pdf_extract_tables_lines(pdf):
loc_sink = locataire.fsm()
next(loc_sink)
charge_sink = charge.fsm()
next(charge_sink)
patrimoine_sink = patrimoine.fsm()
next(patrimoine_sink)
for page_number, page in enumerate(pdf.pages):
page_text = page.extract_text()
situation_loc_line = [
l for l in page_text.split("\n") if "SITUATION DES LOCATAIRES" in l
]
date = extract_date(page_text)
mois = date.strftime("%m")
annee = date.strftime("%Y")
if situation_loc_line:
# mois, annee = situation_loc_line[0].split(" ")[-2:]
if loc_tables:
loc_tables.append(page.extract_table()[1:])
else:
loc_tables.append(page.extract_table())
try:
additionnal_fields = {
"immeuble": extract_building(page_text),
"mois": date.strftime("%m"),
"annee": date.strftime("%Y"),
}
except ValueError:
logging.warning(
f"L'immeuble de la page {page_number+1} non identifiable. Page ignorée."
)
continue
table_type = ""
if locataire.is_it(page_text):
table_type = "locataire"
elif charge.is_it(page_text):
table_type = "charge"
elif patrimoine.is_it(page_text):
table_type = "patrimoine"
else:
logging.warning(
f"Type de la page {page_number+1} non identifiable. Page ignorée."
)
continue
elif "RECAPITULATIF DES OPERATIONS" in page_text:
if charge_table:
charge_table += page.extract_table(charge_table_settings)[1:]
else:
charge_table = page.extract_table(charge_table_settings)
for line in page.extract_table(extract_table_settings):
if table_type == "locataire":
res = loc_sink.send(line)
if res:
res.update(additionnal_fields)
yield locataire.Line(**res)
elif table_type == "charge":
res = charge_sink.send(line)
if res:
res.update(additionnal_fields)
yield charge.Line(**res)
df_charge = extract_charge(charge_table)
df_charge_with_1st = pd.concat([df_1st_charge, df_charge])
df_charge_with_1st.to_excel(charge_dest, sheet_name="Charges", index=False)
logging.info(f"{charge_dest} saved")
df_loc = extract_situation_loc(loc_tables, mois=mois, annee=annee)
df_loc = df_loc.assign()
df_loc.to_excel(location_dest, sheet_name="Location", index=False)
logging.info(f"{location_dest} saved")
elif table_type == "patrimoine":
res = patrimoine_sink.send(line)
if res:
res.update(additionnal_fields)
yield patrimoine.Line(**res)
def extract_save(pdf_file, dest):
def from_pdf(pdf_file):
"""Build dataframes one about charges and another on loc"""
pdf = pdfplumber.open(pdf_file)
locataire_lines = []
charge_lines = []
patrimoine_lines = []
for line in pdf_extract_tables_lines(pdf):
if isinstance(line, locataire.Line):
locataire_lines.append(line)
elif isinstance(line, charge.Line):
charge_lines.append(line)
elif isinstance(line, patrimoine.Line):
patrimoine_lines.append(line)
else:
logging.warning(f"Page {page_number+1} non reconnu. Page ignorée.")
return {
"charge": pd.DataFrame([c.__dict__ for c in charge_lines]),
"locataire": pd.DataFrame([c.__dict__ for c in locataire_lines]),
"patrimoine": pd.DataFrame([c.__dict__ for c in patrimoine_lines]),
}
def extract_plan(pdf_file, dest):
    """Map each table name to the xlsx path that would be produced in `dest`.

    Args:
        pdf_file: source pdf path (a pathlib.Path; its stem names the outputs).
        dest: destination directory.

    Returns:
        dict with keys "charge", "locataire", "patrimoine" mapping to Paths.
    """
    base = pdf_file.stem.replace(" ", "_")
    dest_dir = Path(dest)
    return {
        table: dest_dir / f"{base}_{table}.xlsx"
        for table in ("charge", "locataire", "patrimoine")
    }
def extract_save(pdf_file, dest, save=[]):
"""Extract charge and locataire for pdf_file and put xlsx file in dest"""
pdf_file = Path(pdf_file)
xls_charge = Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_charge.xlsx"
xls_locataire = Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_locataire.xlsx"
xlss = extract_plan(pdf_file, dest)
pdf = pdfplumber.open(pdf_file)
extract_from_pdf(pdf, xls_charge, xls_locataire)
if save != []:
dfs = from_pdf(pdf_file)
for s in save:
dfs[s].to_excel(xlss[s], sheet_name=s, index=False)
logging.info(f"{xlss[s]} saved")
return {k: v for k, v in xlss.items() if k in save}
return xlss

View File

@@ -1,68 +0,0 @@
import logging
import numpy as np
import pandas as pd
def get_lot(x):
"""Return lot number from "RECAPITULATIF DES OPERATIONS" """
if x[:2].isdigit():
return x[:2]
if x[:1].isdigit():
return "0" + x[:1]
if x[:2] == "PC":
return "PC"
return ""
def extract_charge(table):
"""From pdfplumber table extract the charge dataframe"""
df = (
pd.DataFrame(table[1:], columns=table[0])
.replace("", np.nan)
.dropna(subset=["Débits", "Crédits"], how="all")
)
drop_index = df[
df["RECAPITULATIF DES OPERATIONS"].str.contains("TOTAUX", case=False)
| df["RECAPITULATIF DES OPERATIONS"].str.contains("Solde créditeur", case=False)
| df["RECAPITULATIF DES OPERATIONS"].str.contains("Solde débiteur", case=False)
| df["RECAPITULATIF DES OPERATIONS"].str.contains(
"Total des reglements locataires", case=False
)
].index
df.drop(drop_index, inplace=True)
df[""].mask(
df["RECAPITULATIF DES OPERATIONS"].str.contains("honoraires", case=False),
"IMI GERANCE",
inplace=True,
)
df = df.assign(lot=df["RECAPITULATIF DES OPERATIONS"].map(get_lot))
df = df.astype(
{
"Débits": "float64",
"Crédits": "float64",
"Dont T.V.A.": "float64",
"Locatif": "float64",
"Déductible": "float64",
}
)
df.columns.values[0] = "Fournisseur"
return df
def extract_remise_com(table):
"""Extract "remise commercial" from first page"""
df = pd.DataFrame(table[1:], columns=table[0]).replace("", np.nan)
df = df[
df["RECAPITULATIF DES OPERATIONS"].str.contains(
"Remise commerciale gérance", case=False, na=False
)
]
df.columns.values[0] = "Fournisseur"
return df

View File

@@ -1,81 +0,0 @@
import logging
import pandas as pd
def parse_above_loc(content):
row = {}
app, loc, *_ = content.split("\n")
app_ = app.split(" ")
row["lot"] = f"{int(app_[1]):02d}"
row["type"] = " ".join(app_[2:])
row["locataire"] = loc
return pd.Series(row)
def join_row(last, next):
row = []
for i in range(len(last)):
if last[i] and next[i]:
row.append(f"{last[i]}\n{next[i]}")
elif last[i]:
row.append(last[i])
elif next[i]:
row.append(next[i])
else:
row.append("")
return row
def join_tables(tables):
joined = tables[0]
for t in tables[1:]:
last_row = joined[-1]
if "Totaux" not in last_row[0]:
first_row = t[0]
joined_row = join_row(last_row, first_row)
joined = joined[:-1] + [joined_row] + t[1:]
else:
joined += t
return joined
def extract_situation_loc(tables, mois, annee):
"""From pdfplumber table extract locataire df"""
table = join_tables(tables)
try:
df = pd.DataFrame(table[1:], columns=table[0])
except IndexError:
print(table)
rows = []
for i, row in df[df["Locataires"] == "Totaux"].iterrows():
above_row_loc = df.iloc[i - 1]["Locataires"]
up_row = pd.concat(
[
row,
parse_above_loc(above_row_loc),
]
)
rows.append(up_row)
df_cleaned = pd.concat(rows, axis=1).T
df_cleaned.drop(["Locataires", "", "Période"], axis=1, inplace=True)
df_cleaned = df_cleaned.astype(
{
"Loyers": "float64",
"Taxes": "float64",
"Provisions": "float64",
"Divers": "float64",
"Total": "float64",
"Réglés": "float64",
"Impayés": "float64",
},
errors="ignore",
)
df_cleaned = df_cleaned.assign(mois=mois, annee=annee)
return df_cleaned

View File

@@ -1,30 +1,29 @@
import glob
import logging
from pathlib import Path
import pandas as pd
def extract_excel_to_dfs(directory, df_names=["charge", "locataire"]):
p = Path(directory)
dfs = {name: [] for name in df_names}
def join_excel(src, dest, file_pattern):
    """Concatenate every excel file in `src` matching `file_pattern` into a
    single excel file written to `dest`.

    Args:
        src: directory to scan (non-recursive glob).
        dest: path of the output xlsx file.
        file_pattern: glob pattern, e.g. "*_charge.xlsx".
    """
    filenames = list_files(src, file_pattern)
    logging.debug(f"Concatenate {filenames}")
    dfs = extract_dfs(filenames)
    # NOTE(review): pd.concat raises ValueError when no file matched (empty
    # list) — presumably acceptable upstream; confirm.
    joined_df = pd.concat(dfs)
    logging.debug(f"Writing joined excel to {dest}")
    joined_df.to_excel(dest, index=False)
    logging.debug(f"with {len(joined_df)} rows")
for file in p.glob("*.xlsx"):
year, month, immeuble, table = file.stem.split("_")
df = pd.read_excel(file, dtype={"lot": str}).assign(
annee=year, mois=month, immeuble=immeuble[:3]
)
dfs[table].append(df)
def list_files(src, file_glob):
    """Return the list of paths under `src` matching the glob `file_glob`."""
    pattern = f"{src}/{file_glob}"
    return [path for path in glob.iglob(pattern)]
def extract_dfs(filenames):
    """Read each excel file into a DataFrame and return them as a list.

    Args:
        filenames: iterable of xlsx paths readable by pandas.
    """
    dfs = []
    for filename in filenames:
        # NOTE(review): this message looks garbled by extraction — probably
        # meant to interpolate {filename}; confirm against the repository.
        logging.debug(f"Extracting (unknown)")
        df = pd.read_excel(filename)
        logging.debug(f"Found {len(df)} rows")
        dfs.append(df)
    return dfs
def join_excel(directory, dest, df_names=["charge", "locataire"]):
dfs = extract_excel_to_dfs(directory, df_names)
destinations = {}
for tablename, datas in dfs.items():
df = pd.concat(datas)
destination = Path(dest) / f"{tablename}.xlsx"
df.to_excel(destination, index=False)
destinations[tablename] = destination
logging.info(f"{destination} written")
return destinations

View File

@@ -0,0 +1 @@
from . import charge, locataire, patrimoine, recapitulatif

124
pdf_oralia/pages/charge.py Normal file
View File

@@ -0,0 +1,124 @@
import re
from pydantic import BaseModel, field_validator
# Expected pdfplumber header row of a charge table; fsm() skips any incoming
# line equal to it.
HEADER_CHARGE = [
    "",
    "RECAPITULATIF DES OPERATIONS",
    "Débits",
    "Crédits",
    "Dont T.V.A.",
    "Locatif",
    "Déductible",
]
# Column -> dtype mapping for the charge dataframe.  Its consumer is not
# visible in this diff — presumably used to cast the assembled DataFrame;
# confirm against the repository.
DF_TYPES = {
    "Fournisseur": str,
    "RECAPITULATIF DES OPERATIONS": str,
    "Débits": float,
    "Crédits": float,
    "Dont T.V.A.": float,
    "Locatif": float,
    "Déductible": float,
    "immeuble": str,
    "mois": str,
    "annee": str,
    "lot": str,
}
class Line(BaseModel):
    """One extracted charge line, validated by pydantic."""

    mois: int
    annee: int
    immeuble: str
    lot: str
    Champs: str
    Categorie: str
    Fournisseur: str
    Libellé: str
    Débit: float
    Crédits: float
    Dont_TVA: float
    Locatif: float
    Déductible: float

    @field_validator(
        "Débit", "Crédits", "Dont_TVA", "Locatif", "Déductible", mode="before"
    )
    def set_default_if_empty(cls, v):
        # pdfplumber yields "" for empty cells; coerce to 0 before the
        # float parsing performed by pydantic.
        if v == "":
            return 0
        return v
def is_it(page_text):
    """Tell whether the page text belongs to a charge page (recap present,
    but not the "compte rendu de gestion" summary page)."""
    has_recap = "RECAPITULATIF DES OPERATIONS" in page_text
    is_compte_rendu = "COMPTE RENDU DE GESTION" in page_text
    return has_recap and not is_compte_rendu
def get_lot(txt):
    """Extract the zero-padded lot number ("B12 -" -> "12") from a libellé.

    Returns "*" when no lot pattern is found or `txt` is not a string.
    """
    try:
        match = re.search(r"[BSM](\d+)(?=\s*-)", txt)
    except TypeError:
        # Non-string input (e.g. None) carries no lot number.
        return "*"
    if match is None:
        return "*"
    return f"{int(match.group(1)):02d}"
def fsm():
    """Coroutine/state machine consuming pdfplumber table lines of a charge page.

    Prime it with next(), then send() lines one by one; it yields a dict for
    each completed charge row and None otherwise.  States:
      - "total":        skip lines until a "total"/"totaux" marker in column 1,
                        then start reading champs sections.
      - "new_champs":   wait for a non-empty first cell naming a new "Champs".
      - "new_cat_line": inside a champs section; accumulate Categorie /
                        Fournisseur context and emit one dict per amount line.

    NOTE(review): indentation reconstructed from a whitespace-mangled diff —
    confirm the exact control flow against the repository.
    """
    current_state = "total"
    row = {}
    line = yield
    while True:
        # Header rows are skipped regardless of the current state.
        if line == HEADER_CHARGE:
            line = yield
        if current_state == "total":
            if line[1].lower().split(" ")[0] in ["total", "totaux"]:
                current_state = "new_champs"
            line = yield
        elif current_state == "new_champs":
            if line[0] != "":
                current_state = "new_cat_line"
                # Start a fresh running context for the new champs section.
                row = {"Champs": line[0], "Categorie": "", "Fournisseur": ""}
            line = yield
        elif current_state == "new_cat_line":
            if line[1].lower().split(" ")[0] in ["total", "totaux"]:
                # End of the current champs section.
                current_state = "new_champs"
                line = yield
                row = {}
            elif line[2] != "" or line[3] != "":
                # A line carrying a débit or crédit amount is a charge row.
                row.update(
                    {
                        "Fournisseur": line[0] if line[0] != "" else row["Fournisseur"],
                        "Libellé": line[1],
                        "lot": get_lot(line[1]),
                        "Débit": line[2],
                        "Crédits": line[3],
                        "Dont_TVA": line[4],
                        "Locatif": line[5],
                        "Déductible": line[6],
                    }
                )
                line = yield row
                # Keep the section context, drop the per-line fields.
                row = {
                    "Champs": row["Champs"],
                    "Categorie": row["Categorie"],
                    "Fournisseur": row["Fournisseur"],
                }
            elif line[0] != "" and line[1] == "":
                row.update({"Categorie": line[0]})
                line = yield
            elif line[1] != "":
                row.update({"Categorie": line[1]})
                line = yield
            elif line[0] != "":
                row.update({"Fournisseur": line[0]})
                line = yield
            else:
                line = yield

View File

@@ -0,0 +1,93 @@
from pydantic import BaseModel, field_validator
# Expected pdfplumber header row of a "situation des locataires" table;
# fsm() skips any incoming line equal to it.  Index 6 is an empty filler
# column, which is why fsm() reads amounts from indexes 2-5 and 7-9.
HEADER_LOC = [
    "Locataires",
    "Période",
    "Loyers",
    "Taxes",
    "Provisions",
    "Divers",
    "",
    "Total",
    "Réglés",
    "Impayés",
]
class Line(BaseModel):
    """One extracted locataire line, validated by pydantic."""

    mois: int
    annee: int
    immeuble: str
    Lot: str
    Type: str
    Locataire: str
    Loyers: float
    Taxes: float
    Provisions: float
    Divers: float
    Total: float
    Réglés: float
    Impayés: float

    @field_validator(
        "Loyers",
        "Taxes",
        "Provisions",
        "Divers",
        "Total",
        "Réglés",
        "Impayés",
        mode="before",
    )
    def set_default_if_empty(cls, v):
        # pdfplumber yields "" for empty cells; coerce to 0 before the
        # float parsing performed by pydantic.
        if v == "":
            return 0
        return v
def is_it(page_text):
    """Tell whether the page text is a "situation des locataires" page."""
    return "SITUATION DES LOCATAIRES" in page_text
def parse_lot(string):
    """Split "<prefix> <num> <type words...>" into a zero-padded Lot and a Type."""
    _, number, *type_words = string.split(" ")
    return {"Lot": f"{int(number):02d}", "Type": " ".join(type_words)}
def fsm():
    """Coroutine/state machine consuming pdfplumber table lines of a
    "situation des locataires" page.

    Prime it with next(), then send() lines one by one; it yields a completed
    dict on each "Totaux" line and None otherwise.  States:
    "new_row" (wait for a lot line) -> "add_loc" (wait for the locataire
    name) -> "add_totaux" (wait for the "Totaux" amounts line).

    NOTE(review): indentation reconstructed from a whitespace-mangled diff —
    confirm the exact control flow against the repository.
    """
    current_state = "new_row"
    row = {}
    line = yield
    while True:
        if line == HEADER_LOC:
            line = yield
        elif current_state == "new_row":
            if line[0] != "" and line[0] != "TOTAUX":
                row.update(parse_lot(line[0]))
                current_state = "add_loc"
            line = yield
        elif current_state == "add_loc":
            if line[0] != "":
                row["Locataire"] = line[0]
                current_state = "add_totaux"
            line = yield
        elif current_state == "add_totaux":
            if line[0] == "Totaux":
                # Index 6 of HEADER_LOC is a filler column, hence the jump
                # from line[5] to line[7].
                row.update(
                    {
                        "Loyers": line[2],
                        "Taxes": line[3],
                        "Provisions": line[4],
                        "Divers": line[5],
                        "Total": line[7],
                        "Réglés": line[8],
                        "Impayés": line[9],
                    }
                )
                line = yield row
                row = {}
                current_state = "new_row"
            else:
                line = yield

View File

@@ -0,0 +1,74 @@
from pydantic import BaseModel, field_validator
# Expected pdfplumber header row of a "VOTRE PATRIMOINE" table; fsm() skips
# any incoming line equal to it.
HEADER_PATRIMOINE = [
    "Etage",
    "Lots",
    "Type de lot",
    "Nom du Locataire",
    "Loyer Annuel",
    "Début Bail",
    "Fin Bail",
    "Entrée",
    "Départ",
    "Révisé le",
    "U",
    "Dépôt Gar.",
]
class Line(BaseModel):
    """One extracted patrimoine line, validated by pydantic."""

    mois: int
    annee: int
    immeuble: str
    Etage: str
    Lot: str
    Type: str
    Locataire: str
    Loyer_annuel: int
    Debut_bail: str
    Fin_bail: str
    Entree: str
    Depart: str
    Revision_bail: str
    Usage: str
    Depot_garantie: float

    @field_validator("Loyer_annuel", "Depot_garantie", mode="before")
    def set_default_if_empty(cls, v):
        # pdfplumber yields "" for empty cells; coerce to 0 before the
        # numeric parsing performed by pydantic.
        if v == "":
            return 0
        return v
def is_it(page_text):
    """Tell whether the page text is a "VOTRE PATRIMOINE" page."""
    return "VOTRE PATRIMOINE" in page_text
def fsm():
    """Coroutine/state machine consuming pdfplumber table lines of a
    "VOTRE PATRIMOINE" page; yields one dict per line with a non-empty
    "Etage" cell, None otherwise.

    NOTE(review): indentation reconstructed from a whitespace-mangled diff —
    confirm the exact control flow against the repository.
    """
    current_state = "new_line"
    row = {}
    line = yield
    while True:
        if line == HEADER_PATRIMOINE:
            line = yield
        if current_state == "new_line":
            if line[0] != "":
                row = {
                    "Etage": line[0],
                    # Empty Lot/Type cells inherit the previous row's value
                    # (KeyError if the very first data line has them empty —
                    # NOTE(review): confirm this can't happen in practice).
                    "Lot": line[1][-2:] if line[1] != "" else row["Lot"],
                    "Type": line[2] if line[2] != "" else row["Type"],
                    "Locataire": line[3],
                    # Strip spaces from amounts (likely thousand separators
                    # in the pdf rendering — confirm).
                    "Loyer_annuel": line[4].replace(" ", ""),
                    "Debut_bail": line[5],
                    "Fin_bail": line[6],
                    "Entree": line[7],
                    "Depart": line[8],
                    "Revision_bail": line[9],
                    "Usage": line[10],
                    "Depot_garantie": line[11].replace(" ", ""),
                }
                line = yield row
            else:
                line = yield

View File

@@ -0,0 +1,34 @@
import numpy as np
import pandas as pd
def is_it(page_text):
    """Tell whether the page is the "COMPTE RENDU DE GESTION" summary page."""
    return "COMPTE RENDU DE GESTION" in page_text
def extract(table, additionnal_fields: dict = {}):
"""Extract "remise commercial" from first page"""
extracted = []
header = table[0]
for row in table[1:]:
if "Remise commerciale gérance" in row:
r = dict()
for i, value in enumerate(row):
r[header[i]] = value
for k, v in additionnal_fields.items():
r[k] = v
extracted.append(r)
return extracted
# df = pd.DataFrame(table[1:], columns=table[0]).replace("", np.nan)
# df = df[
# df["RECAPITULATIF DES OPERATIONS"].str.contains(
# "Remise commerciale gérance", case=False, na=False
# )
# ]
#
# df.columns.values[0] = "Fournisseur"
# return df

View File

@@ -3,33 +3,35 @@ from logging.config import dictConfig
from pathlib import Path
import click
import pandas as pd
from .extract import extract_save
from .extract import extract_save, extract_plan
from .join import join_excel
logging_config = dict(
version=1,
formatters={"f": {"format": "%(levelname)-8s %(name)-12s %(message)s"}},
handlers={
"h": {
"class": "logging.StreamHandler",
"formatter": "f",
"level": logging.DEBUG,
}
},
root={
"handlers": ["h"],
"level": logging.DEBUG,
},
)
dictConfig(logging_config)
@click.group()
def main():
pass
@click.option("--debug/--no-debug", default=False)
def main(debug):
if debug:
logging_level = logging.DEBUG
else:
logging_level = logging.INFO
logging_config = dict(
version=1,
formatters={"f": {"format": "%(levelname)-8s %(name)-12s %(message)s"}},
handlers={
"h": {
"class": "logging.StreamHandler",
"formatter": "f",
"level": logging_level,
}
},
root={
"handlers": ["h"],
"level": logging_level,
},
)
dictConfig(logging_config)
@main.group()
@@ -49,22 +51,75 @@ def on(pdf_file, dest):
@extract.command()
@click.option("--src", help="Tous les fichiers dans folder", default="./")
@click.option(
"--src", help="Tous les fichiers dans folder (de façon récursive)", default="./"
)
@click.option("--dest", help="Où mettre les fichiers produits", default="./")
def all(src, dest):
p = Path(src)
@click.option(
"--only-plan",
help="Ne produit rien mais indique les changements",
default=False,
is_flag=True,
)
@click.option(
"--force",
help="Écrase les fichiers produits précédemment",
default=False,
is_flag=True,
)
def all(src, dest, force, only_plan):
src_path = Path(src)
d = Path(dest)
d.mkdir(exist_ok=True)
dest = Path(dest)
dest.mkdir(exist_ok=True)
pdf_files = [x for x in p.iterdir() if ".pdf" in str(x)]
for pdf_file in pdf_files:
for pdf_file in src_path.rglob("**/*.pdf"):
relative_path = pdf_file.relative_to(src_path)
files_dest = dest / relative_path.parent
logging.info(f"Found {pdf_file}")
extract_save(pdf_file, d)
plan_dest = extract_plan(pdf_file, files_dest)
save = []
for k, p in plan_dest.items():
if not p.exists() or force:
save.append(k)
if only_plan:
for s in save:
logging.info(f"Planing to create {plan_dest[s]}")
else:
files_dest.mkdir(parents=True, exist_ok=True)
extract_save(pdf_file, files_dest, save)
@main.command()
@click.option("--src", help="Tous les fichiers dans src", default="./")
@click.option("--dest", help="Où mettre les fichiers produits", default="")
def join(src, dest):
join_excel(src, dest, df_names=["charge", "locataire"])
@click.option(
"--force",
help="Ecraser si le ficher destination existe.",
default=False,
is_flag=True,
)
def join(src, dest, force):
"""Join tous les fichiers excel charge (resp locataire) de src dans un seul fichier charge.xlsx dans dist.
Exemple:
pdf-oralia join --src <dossier_source> --dest <dossier_destination>
"""
dest_charge = f"{dest}/charge.xlsx"
if not force and Path(dest_charge).exists():
raise ValueError(f"The file {dest_charge} already exists")
dest_locataire = f"{dest}/locataire.xlsx"
if not force and Path(dest_locataire).exists():
raise ValueError(f"The file {dest_locataire} already exists")
if not Path(src).exists():
raise ValueError(f"The source directory ({src}) does not exists.")
join_excel(src, dest_charge, "*_charge.xlsx")
logging.info(f"Les données charges ont été concaténées dans {dest_charge}")
join_excel(src, dest_locataire, "*_locataire.xlsx")
logging.info(f"Les données locataires ont été concaténées dans {dest_locataire}")

4516
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdf-oralia"
version = "VERSION_PLACEHOLDER"
version = "0"
description = ""
authors = ["Bertrand Benjamin <benjamin.bertrand@opytex.org>"]
readme = "README.md"
@@ -13,13 +13,14 @@ pdf-oralia = "pdf_oralia.scripts:main"
python = "^3.10"
click = "^8.1.3"
pdfplumber = "^0.7.4"
pandas = "^1.5.0"
pandas = "^2.2.3"
openpyxl = "^3.0.10"
[tool.poetry.group.dev.dependencies]
pre-commit = "^2.20.0"
jupyter = "^1.0.0"
tabulate = "^0.9.0"
[build-system]
requires = ["poetry-core"]

2
renovate.json Normal file
View File

@@ -0,0 +1,2 @@
{
}

View File

@@ -1,76 +1,5 @@
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
asttokens==2.0.8
attrs==22.1.0
backcall==0.2.0
beautifulsoup4==4.11.1
bleach==5.0.1
cffi==1.15.1
charset-normalizer==2.1.1
cryptography==38.0.1
debugpy==1.6.3
decorator==5.1.1
defusedxml==0.7.1
entrypoints==0.4
et-xmlfile==1.1.0
executing==1.1.0
fastjsonschema==2.16.2
ipykernel==6.16.0
ipython==8.5.0
ipython-genutils==0.2.0
ipywidgets==8.0.2
jedi==0.18.1
Jinja2==3.1.2
jsonschema==4.16.0
jupyter==1.0.0
jupyter-console==6.4.4
jupyter-core==4.11.1
jupyter_client==7.3.5
jupyterlab-pygments==0.2.2
jupyterlab-widgets==3.0.3
lxml==4.9.1
MarkupSafe==2.1.1
matplotlib-inline==0.1.6
mistune==2.0.4
nbclient==0.6.8
nbconvert==7.0.0
nbformat==5.6.1
nest-asyncio==1.5.5
notebook==6.4.12
numpy==1.23.3
openpyxl==3.0.10
packaging==21.3
pandas==1.5.0
pandocfilters==1.5.0
parso==0.8.3
pdfminer.six==20220524
pdfplumber==0.7.4
pexpect==4.8.0
pickleshare==0.7.5
Pillow==9.2.0
prometheus-client==0.14.1
prompt-toolkit==3.0.31
psutil==5.9.2
ptyprocess==0.7.0
pure-eval==0.2.2
pycparser==2.21
Pygments==2.13.0
pyparsing==3.0.9
pyrsistent==0.18.1
python-dateutil==2.8.2
pytz==2022.2.1
pyzmq==24.0.1
qtconsole==5.3.2
QtPy==2.2.0
Send2Trash==1.8.0
six==1.16.0
soupsieve==2.3.2.post1
stack-data==0.5.1
terminado==0.15.0
tinycss2==1.1.1
tornado==6.2
traitlets==5.4.0
Wand==0.6.10
wcwidth==0.2.5
webencodings==0.5.1
widgetsnbextension==4.0.3
pdfplumber
numpy
pandas
click
openpyxl