21 Commits

Author SHA1 Message Date
8b4d5826a5 add workflows
Some checks failed
pdf-auralia-build-and-publish / build-and-publish (push) Failing after 53s
2025-02-23 12:19:04 +01:00
6e0ffe9085 core: change pandas version 2024-10-16 06:47:55 +02:00
ab2fdb0541 Feat: make from_pdf importable and move plumber in it 2024-10-16 06:47:25 +02:00
0fc39ed317 Merge pull request 'Update dependency MarkupSafe to v2.1.5' (#17) from renovate/markupsafe-2.x into main
Reviewed-on: #17
2024-02-17 05:08:19 +00:00
a6d6681756 Merge branch 'main' into renovate/markupsafe-2.x 2024-02-17 05:08:08 +00:00
4eecb3a44c Merge pull request 'Update dependency Jinja2 to v3.1.3' (#16) from renovate/jinja2-3.x into main
Reviewed-on: #16
2024-02-17 05:07:48 +00:00
60da623323 Update dependency MarkupSafe to v2.1.5 2024-02-17 05:04:52 +00:00
1f1e3e2741 Update dependency Jinja2 to v3.1.3 2024-02-17 05:04:48 +00:00
2b3e935f39 Merge pull request 'Update dependency Send2Trash to v1.8.2' (#15) from renovate/send2trash-1.x into main
Reviewed-on: #15
2024-02-17 04:56:16 +00:00
ef63f22d44 Merge pull request 'Update dependency MarkupSafe to v2.1.3' (#14) from renovate/markupsafe-2.x into main
Reviewed-on: #14
2024-02-17 04:55:55 +00:00
1020ef9257 Update dependency Send2Trash to v1.8.2 2024-01-10 11:04:32 +00:00
39084ceebd Update dependency MarkupSafe to v2.1.3 2024-01-10 11:04:30 +00:00
7de6c8dd9c clean renovate.json 2024-01-10 10:46:45 +00:00
da3815eea6 activate renovate 2024-01-09 06:53:09 +00:00
45d343d810 Feat: add raise error when src does not exists 2024-01-02 22:22:58 +01:00
806227f202 Feat: add logging in join 2023-12-30 17:45:15 +01:00
7bf0c38883 Feat: add option for debugging 2023-12-30 17:25:40 +01:00
b15b059e2a Add debug 2023-12-27 19:58:12 +01:00
48e75358ac Fix: remove index in excel outputs 2023-10-05 15:22:14 +02:00
132e37267b Feat: logging and option about overwritting 2023-10-05 15:19:16 +02:00
f2bcf6241a Fix: rebuild join_excel 2023-10-05 15:10:39 +02:00
9 changed files with 1968 additions and 1180 deletions

View File

@@ -0,0 +1,28 @@
# Build the package and publish it to PyPI whenever a tag starting with "v"
# is pushed. The pushed tag name is written into pyproject.toml as the
# package version before building.
name: pdf-auralia-build-and-publish
on:
  push:
    tags:
      - "v*"
jobs:
  build-and-publish:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      - name: Set up Python 3.11
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"
      - name: Build and publish
        run: |
          echo "$GITHUB_REF"
          # The sed expression must be double-quoted: inside single quotes the
          # shell does not expand ${GITHUB_REF##*/}, so the literal text would
          # be written into pyproject.toml instead of the tag name.
          # NOTE(review): the tag keeps its leading "v" (e.g. "v1.2.3");
          # confirm Poetry accepts that as a version string.
          sed -i "s/version = \"[^\"]*\"/version = \"${GITHUB_REF##*/}\"/g" pyproject.toml
          curl -sSL https://install.python-poetry.org | python3 -
          # The installer puts the poetry launcher under the current user's
          # home, so use $HOME rather than assuming the job runs as root.
          export PATH="$HOME/.local/bin:$PATH"
          poetry --version
          poetry build
          # Read the token from the step environment instead of interpolating
          # the secret into the script text.
          poetry publish --username __token__ --password "$PYPI_TOKEN"
        env:
          PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}

View File

@@ -0,0 +1 @@
from .extract import from_pdf

View File

@@ -38,8 +38,9 @@ def catch_malformed_table(tables):
return tables[0]
def from_pdf(pdf):
def from_pdf(pdf_file):
"""Build dataframes one about charges and another on loc"""
pdf = pdfplumber.open(pdf_file)
recapitulatif_tables = []
loc_tables = []
charge_tables = []
@@ -90,8 +91,7 @@ def extract_save(pdf_file, dest):
xls_charge = Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_charge.xlsx"
xls_locataire = Path(dest) / f"{pdf_file.stem.replace(' ', '_')}_locataire.xlsx"
pdf = pdfplumber.open(pdf_file)
df_charge, df_loc = from_pdf(pdf)
df_charge, df_loc = from_pdf(pdf_file)
df_charge.to_excel(xls_charge, sheet_name="Charges", index=False)
logging.info(f"{xls_charge} saved")

29
pdf_oralia/join.py Normal file
View File

@@ -0,0 +1,29 @@
import glob
import logging
import pandas as pd
def join_excel(src, dest, file_pattern):
    """Join every Excel file in src matching file_pattern into one file at dest.

    Args:
        src: directory searched for the input Excel files.
        dest: path of the Excel file to write.
        file_pattern: glob pattern, relative to src, selecting the inputs.

    Raises:
        ValueError: if no file in src matches file_pattern.
    """
    filenames = list_files(src, file_pattern)
    if not filenames:
        # pd.concat rejects an empty list with an opaque message; fail early
        # with one that names the directory and the pattern instead.
        raise ValueError(f"No file matching {file_pattern} found in {src}")
    logging.debug(f"Concatenate {filenames}")
    dfs = extract_dfs(filenames)
    joined_df = pd.concat(dfs)
    logging.debug(f"Writing joined excel to {dest}")
    # index=False keeps the DataFrame index out of the written sheet.
    joined_df.to_excel(dest, index=False)
    logging.debug(f"with {len(joined_df)} rows")
def list_files(src, file_glob):
    """Return the paths in src matching file_glob, sorted alphabetically.

    Args:
        src: directory to search; taken literally, never as a pattern.
        file_glob: glob pattern applied to the entries of src.

    Returns:
        A sorted list of matching paths as strings (empty if none match).
    """
    # Escape src so that characters such as "[" or "*" in the directory name
    # are not interpreted as glob syntax; only file_glob acts as a pattern.
    pattern = f"{glob.escape(str(src))}/{file_glob}"
    # glob yields entries in arbitrary filesystem order; sorting makes the
    # result (and anything built from it) reproducible.
    return sorted(glob.iglob(pattern))
def extract_dfs(filenames):
    """Read each Excel file in filenames and return the DataFrames as a list.

    The returned list follows the order of filenames. A debug message is
    logged before each file is read and again with its row count afterwards.
    """

    def _read(path):
        # Load a single file, logging before the read and after with the size.
        logging.debug(f"Extracting {path}")
        frame = pd.read_excel(path)
        logging.debug(f"Found {len(frame)} rows")
        return frame

    return [_read(path) for path in filenames]

View File

@@ -5,29 +5,33 @@ from pathlib import Path
import click
from .extract import extract_save
logging_config = dict(
version=1,
formatters={"f": {"format": "%(levelname)-8s %(name)-12s %(message)s"}},
handlers={
"h": {
"class": "logging.StreamHandler",
"formatter": "f",
"level": logging.DEBUG,
}
},
root={
"handlers": ["h"],
"level": logging.DEBUG,
},
)
dictConfig(logging_config)
from .join import join_excel
@click.group()
def main():
pass
@click.option("--debug/--no-debug", default=False)
def main(debug):
if debug:
logging_level = logging.DEBUG
else:
logging_level = logging.INFO
logging_config = dict(
version=1,
formatters={"f": {"format": "%(levelname)-8s %(name)-12s %(message)s"}},
handlers={
"h": {
"class": "logging.StreamHandler",
"formatter": "f",
"level": logging_level,
}
},
root={
"handlers": ["h"],
"level": logging_level,
},
)
dictConfig(logging_config)
@main.group()
@@ -64,5 +68,31 @@ def all(src, dest):
@main.command()
@click.option("--src", help="Tous les fichiers dans src", default="./")
@click.option("--dest", help="Où mettre les fichiers produits", default="")
def join(src, dest):
join_excel(src, dest, df_names=["charge", "locataire"])
@click.option(
"--force",
help="Ecraser si le ficher destination existe.",
default=False,
is_flag=True,
)
def join(src, dest, force):
"""Join tous les fichiers excel charge (resp locataire) de src dans un seul fichier charge.xlsx dans dist.
Exemple:
pdf-oralia join --src <dossier_source> --dest <dossier_destination>
"""
dest_charge = f"{dest}/charge.xlsx"
if not force and Path(dest_charge).exists():
raise ValueError(f"The file {dest_charge} already exists")
dest_locataire = f"{dest}/locataire.xlsx"
if not force and Path(dest_locataire).exists():
raise ValueError(f"The file {dest_locataire} already exists")
if not Path(src).exists():
raise ValueError(f"The source directory ({src}) does not exists.")
join_excel(src, dest_charge, "*_charge.xlsx")
logging.info(f"Les données charges ont été concaténées dans {dest_charge}")
join_excel(src, dest_locataire, "*_locataire.xlsx")
logging.info(f"Les données locataires ont été concaténées dans {dest_locataire}")

2927
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -13,7 +13,7 @@ pdf-oralia = "pdf_oralia.scripts:main"
python = "^3.10"
click = "^8.1.3"
pdfplumber = "^0.7.4"
pandas = "^1.5.0"
pandas = "^2.2.3"
openpyxl = "^3.0.10"

2
renovate.json Normal file
View File

@@ -0,0 +1,2 @@
{
}

View File

@@ -1,76 +1,3 @@
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
asttokens==2.0.8
attrs==22.1.0
backcall==0.2.0
beautifulsoup4==4.11.1
bleach==5.0.1
cffi==1.15.1
charset-normalizer==2.1.1
cryptography==38.0.1
debugpy==1.6.3
decorator==5.1.1
defusedxml==0.7.1
entrypoints==0.4
et-xmlfile==1.1.0
executing==1.1.0
fastjsonschema==2.16.2
ipykernel==6.16.0
ipython==8.5.0
ipython-genutils==0.2.0
ipywidgets==8.0.2
jedi==0.18.1
Jinja2==3.1.2
jsonschema==4.16.0
jupyter==1.0.0
jupyter-console==6.4.4
jupyter-core==4.11.1
jupyter_client==7.3.5
jupyterlab-pygments==0.2.2
jupyterlab-widgets==3.0.3
lxml==4.9.1
MarkupSafe==2.1.1
matplotlib-inline==0.1.6
mistune==2.0.4
nbclient==0.6.8
nbconvert==7.0.0
nbformat==5.6.1
nest-asyncio==1.5.5
notebook==6.4.12
numpy==1.23.3
openpyxl==3.0.10
packaging==21.3
pandas==1.5.0
pandocfilters==1.5.0
parso==0.8.3
pdfminer.six==20220524
pdfplumber==0.7.4
pexpect==4.8.0
pickleshare==0.7.5
Pillow==9.2.0
prometheus-client==0.14.1
prompt-toolkit==3.0.31
psutil==5.9.2
ptyprocess==0.7.0
pure-eval==0.2.2
pycparser==2.21
Pygments==2.13.0
pyparsing==3.0.9
pyrsistent==0.18.1
python-dateutil==2.8.2
pytz==2022.2.1
pyzmq==24.0.1
qtconsole==5.3.2
QtPy==2.2.0
Send2Trash==1.8.0
six==1.16.0
soupsieve==2.3.2.post1
stack-data==0.5.1
terminado==0.15.0
tinycss2==1.1.1
tornado==6.2
traitlets==5.4.0
Wand==0.6.10
wcwidth==0.2.5
webencodings==0.5.1
widgetsnbextension==4.0.3
pdfplumber
numpy
pandas