From 546f4d1949a3e532930035cf5e592ea728a91778 Mon Sep 17 00:00:00 2001 From: Bertrand Benjamin Date: Tue, 27 Sep 2022 14:48:41 +0200 Subject: [PATCH] Feat: init repository --- .gitignore | 1 + Extract pdf.ipynb | 679 ++++++++++++++++++++++++++++++++++++++++ README.md | 0 pdf_auralia/__init__.py | 0 pdf_auralia/extract.py | 116 +++++++ poetry.lock | 8 + pyproject.toml | 15 + requirements.txt | 76 +++++ tests/__init__.py | 0 9 files changed, 895 insertions(+) create mode 100644 .gitignore create mode 100644 Extract pdf.ipynb create mode 100644 README.md create mode 100644 pdf_auralia/__init__.py create mode 100644 pdf_auralia/extract.py create mode 100644 poetry.lock create mode 100644 pyproject.toml create mode 100644 requirements.txt create mode 100644 tests/__init__.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..44d6228 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +pdfs/ diff --git a/Extract pdf.ipynb b/Extract pdf.ipynb new file mode 100644 index 0000000..6c37221 --- /dev/null +++ b/Extract pdf.ipynb @@ -0,0 +1,679 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e24ca74b", + "metadata": {}, + "source": [ + "# Extraction des informations pour Oralia" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1ac85f0c", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import pdfplumber\n", + "from pathlib import Path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6b246985", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b80265f1", + "metadata": {}, + "outputs": [], + "source": [ + "pdf_file = Path(\"./pdfs/2022 04 Servient.pdf\")\n", + "pdf = pdfplumber.open(pdf_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a62448d8", + "metadata": {}, + "outputs": [], + "source": [ + "xls_charge = f\"{pdf_file.stem.replace(' ', '_')}_charge.xlsx\"\n", + "xls_locataire = f\"{pdf_file.stem.replace(' ', '_')}_locataire.xlsx\"" + ] + }, + { + "cell_type": "markdown", + "id": "1f503cf5", + "metadata": {}, + "source": [ + "## Page 1: Récapitulatif" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ae9eb950", + "metadata": {}, + "outputs": [], + "source": [ + "p1 = pdf.pages[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "32ef66d7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"SITUATION DES LOCATAIRES\" in p1.extract_text()" + ] + }, + { + "cell_type": "markdown", + "id": "50dd9c09", + "metadata": {}, + "source": [ + "## Situation des locataires" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e9c0aefd", + "metadata": {}, + "outputs": [], + "source": [ + "def extract_situation_loc(table):\n", + " df = pd.DataFrame(table[1:], columns=table[0])\n", + " rows = []\n", + " for i, row in df[df[\"Locataires\"]==\"Totaux\"].iterrows():\n", + " above_row_loc = df.iloc[i-1][\"Locataires\"]\n", + " up_row = pd.concat([row, \n", + " parse_above_loc(above_row_loc),\n", + " ])\n", + "\n", + " rows.append(up_row)\n", + " df_cleaned = pd.concat(rows, axis=1).T\n", + " df_cleaned.drop([\"Locataires\", \"\", \"Période\"], axis=1, inplace=True)\n", + " return df_cleaned\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "87e05f50", + "metadata": {}, + "outputs": [], + "source": [ + "def parse_above_loc(content):\n", + " row = {}\n", + " try:\n", + " app, loc = content.split(\"\\n\")\n", + " except ValueError:\n", + " row[\"lot\"] = \"\"\n", + " row[\"type\"] = \"\"\n", + " row[\"locataire\"] = content\n", + " \n", + " else:\n", + " app_ = app.split(\" \")\n", + " row[\"lot\"] = app_[1]\n", + " row[\"type\"] = \" \".join(app_[2:])\n", + " row[\"locataire\"] = loc\n", + " return pd.Series(row)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19a1446d", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "8afb23c7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LoyersTaxesProvisionsDiversTotalRéglésImpayéslottypelocataire
0342.800.00663.000.001005.801005.800001Loc. CommercialEFFUSION
13473.790.00519.0096.794089.584089.580002Loc. CommercialRAS
2597.200.0031.000.00628.60628.600003Appartement T1KALAI Bernard
3596.590.0031.000.00627.59627.590004Appartement T2PEJAUDIER Adelaide
4468.850.0020.000.00981.03485.00496.030009Appartement T1MANNA Baptiste
5745.390.00191.000.00936.39936.390005Loc. CommercialATELIERS RENAISSANCE
6834.550.0081.000.00915.55915.550006Appartement T3GUELLIER MURIEL
7591.690.0050.000.00641.69641.69Lot 0007 Appartement T1\\nDOMINIKIEWICZ\\nMELANIE
8574.710.0028.000.00602.71602.710008Appartement T1BESSON Léa
91201.100.0087.000.001288.101288.100010Appartement T3FILIPPI Bérengère
10500.460.0028.000.00528.46528.460011Appartement T1LOINE Anaïs
\n", + "
" + ], + "text/plain": [ + " Loyers Taxes Provisions Divers Total Réglés Impayés lot \\\n", + "0 342.80 0.00 663.00 0.00 1005.80 1005.80 0001 \n", + "1 3473.79 0.00 519.00 96.79 4089.58 4089.58 0002 \n", + "2 597.20 0.00 31.00 0.00 628.60 628.60 0003 \n", + "3 596.59 0.00 31.00 0.00 627.59 627.59 0004 \n", + "4 468.85 0.00 20.00 0.00 981.03 485.00 496.03 0009 \n", + "5 745.39 0.00 191.00 0.00 936.39 936.39 0005 \n", + "6 834.55 0.00 81.00 0.00 915.55 915.55 0006 \n", + "7 591.69 0.00 50.00 0.00 641.69 641.69 \n", + "8 574.71 0.00 28.00 0.00 602.71 602.71 0008 \n", + "9 1201.10 0.00 87.00 0.00 1288.10 1288.10 0010 \n", + "10 500.46 0.00 28.00 0.00 528.46 528.46 0011 \n", + "\n", + " type locataire \n", + "0 Loc. Commercial EFFUSION \n", + "1 Loc. Commercial RAS \n", + "2 Appartement T1 KALAI Bernard \n", + "3 Appartement T2 PEJAUDIER Adelaide \n", + "4 Appartement T1 MANNA Baptiste \n", + "5 Loc. Commercial ATELIERS RENAISSANCE \n", + "6 Appartement T3 GUELLIER MURIEL \n", + "7 Lot 0007 Appartement T1\\nDOMINIKIEWICZ\\nMELANIE \n", + "8 Appartement T1 BESSON Léa \n", + "9 Appartement T3 FILIPPI Bérengère \n", + "10 Appartement T1 LOINE Anaïs " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p2 = pdf.pages[1]\n", + "extract_situation_loc(p2.extract_table())" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "0e0ddca7", + "metadata": {}, + "outputs": [], + "source": [ + "charge_table_settings = {\n", + " \"vertical_strategy\": \"lines\",\n", + " \"horizontal_strategy\": \"text\",\n", + "}\n", + "def extract_charge(table):\n", + " df = pd.DataFrame(table[1:], columns=table[0]).replace(\"\", np.nan).dropna(subset=[\"Débits\"])\n", + " drop_index = df[df[\"RECAPITULATIF DES OPERATIONS\"].str.contains(\"TOTAUX\", case=False)].index\n", + " df.drop(drop_index, inplace=True)\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b915b220", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RECAPITULATIF DES OPERATIONSDébitsCréditsDont T.V.A.LocatifDéductible
4DIDIER NETTOYAGEPC - ENTRETIEN IMMEUBLE708.58NaN118.10708.58NaN
6TOTAL DIRECT ENERGIEPC TOTAL DIRECT ENERGIE65.70NaN7.0365.70NaN
7EDFPC EDF DU 17.04.202289.56NaN10.2289.56NaN
9PICARD SERVICEFacture du 11/04/202266.76NaN6.0766.76NaN
15V2C MAINTENANCE6 - remplacement circulateur chudière447.70NaN40.70NaN447.70
20IMI GERANCETAVARES NORTE Dylan93.00NaN15.50NaN93.00
23IMI GERANCETAVARES NORTE Dylan173.58NaN28.93NaN173.58
27IMI GERANCETAVARES NORTE Dylan798.72NaN133.12NaN798.72
29NaNHonoraires H.T.979.20NaNNaNNaN979.20
30NaNTVA/Honoraires ( 20.00 % )195.84NaN195.84NaN195.84
\n", + "
" + ], + "text/plain": [ + " RECAPITULATIF DES OPERATIONS Débits \\\n", + "4 DIDIER NETTOYAGE PC - ENTRETIEN IMMEUBLE 708.58 \n", + "6 TOTAL DIRECT ENERGIE PC TOTAL DIRECT ENERGIE 65.70 \n", + "7 EDF PC EDF DU 17.04.2022 89.56 \n", + "9 PICARD SERVICE Facture du 11/04/2022 66.76 \n", + "15 V2C MAINTENANCE 6 - remplacement circulateur chudière 447.70 \n", + "20 IMI GERANCE TAVARES NORTE Dylan 93.00 \n", + "23 IMI GERANCE TAVARES NORTE Dylan 173.58 \n", + "27 IMI GERANCE TAVARES NORTE Dylan 798.72 \n", + "29 NaN Honoraires H.T. 979.20 \n", + "30 NaN TVA/Honoraires ( 20.00 % ) 195.84 \n", + "\n", + " Crédits Dont T.V.A. Locatif Déductible \n", + "4 NaN 118.10 708.58 NaN \n", + "6 NaN 7.03 65.70 NaN \n", + "7 NaN 10.22 89.56 NaN \n", + "9 NaN 6.07 66.76 NaN \n", + "15 NaN 40.70 NaN 447.70 \n", + "20 NaN 15.50 NaN 93.00 \n", + "23 NaN 28.93 NaN 173.58 \n", + "27 NaN 133.12 NaN 798.72 \n", + "29 NaN NaN NaN 979.20 \n", + "30 NaN 195.84 NaN 195.84 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "p4 = pdf.pages[3]\n", + "extract_charge(p4.extract_table(charge_table_settings))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "c7b071fa", + "metadata": {}, + "outputs": [], + "source": [ + "# im = p4.to_image()\n", + "# im.debug_tablefinder(charge_table_settings)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "ebe2881a", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'openpyxl'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn [12], line 15\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mHONORAIRES\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m page\u001b[38;5;241m.\u001b[39mextract_text():\n\u001b[1;32m 13\u001b[0m df_charge \u001b[38;5;241m=\u001b[39m extract_charge(page\u001b[38;5;241m.\u001b[39mextract_table(charge_table_settings))\n\u001b[0;32m---> 15\u001b[0m df_charge\u001b[38;5;241m.\u001b[39mto_excel(xls_charge, sheet_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCharges\u001b[39m\u001b[38;5;124m\"\u001b[39m, index\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 18\u001b[0m df_loc \u001b[38;5;241m=\u001b[39m extract_situation_loc(loc_table)\n\u001b[1;32m 19\u001b[0m df_loc \u001b[38;5;241m=\u001b[39m df_loc\u001b[38;5;241m.\u001b[39massign(\n\u001b[1;32m 20\u001b[0m mois \u001b[38;5;241m=\u001b[39m mois,\n\u001b[1;32m 21\u001b[0m annee \u001b[38;5;241m=\u001b[39m annee\n\u001b[1;32m 22\u001b[0m )\n", + "File \u001b[0;32m~/.venv/plesna/lib/python3.10/site-packages/pandas/util/_decorators.py:211\u001b[0m, in \u001b[0;36mdeprecate_kwarg.._deprecate_kwarg..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 210\u001b[0m kwargs[new_arg_name] \u001b[38;5;241m=\u001b[39m new_arg_value\n\u001b[0;32m--> 211\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.venv/plesna/lib/python3.10/site-packages/pandas/util/_decorators.py:211\u001b[0m, in \u001b[0;36mdeprecate_kwarg.._deprecate_kwarg..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 210\u001b[0m kwargs[new_arg_name] \u001b[38;5;241m=\u001b[39m new_arg_value\n\u001b[0;32m--> 211\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.venv/plesna/lib/python3.10/site-packages/pandas/core/generic.py:2373\u001b[0m, in \u001b[0;36mNDFrame.to_excel\u001b[0;34m(self, excel_writer, sheet_name, na_rep, float_format, columns, header, index, index_label, startrow, startcol, engine, merge_cells, encoding, inf_rep, verbose, freeze_panes, storage_options)\u001b[0m\n\u001b[1;32m 2360\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mio\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mformats\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mexcel\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ExcelFormatter\n\u001b[1;32m 2362\u001b[0m formatter \u001b[38;5;241m=\u001b[39m ExcelFormatter(\n\u001b[1;32m 2363\u001b[0m df,\n\u001b[1;32m 2364\u001b[0m na_rep\u001b[38;5;241m=\u001b[39mna_rep,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 2371\u001b[0m inf_rep\u001b[38;5;241m=\u001b[39minf_rep,\n\u001b[1;32m 2372\u001b[0m )\n\u001b[0;32m-> 2373\u001b[0m \u001b[43mformatter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwrite\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2374\u001b[0m \u001b[43m \u001b[49m\u001b[43mexcel_writer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2375\u001b[0m \u001b[43m \u001b[49m\u001b[43msheet_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msheet_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2376\u001b[0m \u001b[43m \u001b[49m\u001b[43mstartrow\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstartrow\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2377\u001b[0m \u001b[43m \u001b[49m\u001b[43mstartcol\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstartcol\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2378\u001b[0m \u001b[43m \u001b[49m\u001b[43mfreeze_panes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfreeze_panes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2379\u001b[0m \u001b[43m \u001b[49m\u001b[43mengine\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2380\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2381\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/.venv/plesna/lib/python3.10/site-packages/pandas/io/formats/excel.py:911\u001b[0m, in \u001b[0;36mExcelFormatter.write\u001b[0;34m(self, writer, sheet_name, startrow, startcol, freeze_panes, engine, storage_options)\u001b[0m\n\u001b[1;32m 907\u001b[0m need_save \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 908\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 909\u001b[0m \u001b[38;5;66;03m# error: Cannot instantiate abstract class 'ExcelWriter' with abstract\u001b[39;00m\n\u001b[1;32m 910\u001b[0m \u001b[38;5;66;03m# attributes 'engine', 'save', 'supported_extensions' and 'write_cells'\u001b[39;00m\n\u001b[0;32m--> 911\u001b[0m writer \u001b[38;5;241m=\u001b[39m \u001b[43mExcelWriter\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore[abstract]\u001b[39;49;00m\n\u001b[1;32m 912\u001b[0m \u001b[43m \u001b[49m\u001b[43mwriter\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mengine\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\n\u001b[1;32m 913\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 914\u001b[0m need_save \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 916\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n", + "File \u001b[0;32m~/.venv/plesna/lib/python3.10/site-packages/pandas/io/excel/_openpyxl.py:56\u001b[0m, in \u001b[0;36mOpenpyxlWriter.__init__\u001b[0;34m(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs, **kwargs)\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\n\u001b[1;32m 44\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 45\u001b[0m path: FilePath \u001b[38;5;241m|\u001b[39m WriteExcelBuffer \u001b[38;5;241m|\u001b[39m ExcelWriter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 54\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 55\u001b[0m \u001b[38;5;66;03m# Use the openpyxl module as the Excel writer.\u001b[39;00m\n\u001b[0;32m---> 56\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mopenpyxl\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mworkbook\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Workbook\n\u001b[1;32m 58\u001b[0m engine_kwargs \u001b[38;5;241m=\u001b[39m combine_kwargs(engine_kwargs, kwargs)\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m(\n\u001b[1;32m 61\u001b[0m path,\n\u001b[1;32m 62\u001b[0m mode\u001b[38;5;241m=\u001b[39mmode,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 65\u001b[0m engine_kwargs\u001b[38;5;241m=\u001b[39mengine_kwargs,\n\u001b[1;32m 66\u001b[0m )\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'openpyxl'" + ] + } + ], + "source": [ + "frames = []\n", + "loc_table = []\n", + "for page in pdf.pages:\n", + " situation_loc_line = [l for l in page.extract_text().split(\"\\n\") if \"SITUATION DES LOCATAIRES CRG\" in l]\n", + " if situation_loc_line:\n", + " mois, annee = situation_loc_line[0].split(\" \")[-2:]\n", + " if loc_table:\n", + " loc_table += page.extract_table()[1:]\n", + " else:\n", + " loc_table = page.extract_table()\n", + "\n", + " if \"HONORAIRES\" in page.extract_text():\n", + " df_charge = extract_charge(page.extract_table(charge_table_settings))\n", + "\n", + " df_charge.to_excel(xls_charge, sheet_name=\"Charges\", index=False)\n", + "\n", + "\n", + "df_loc = extract_situation_loc(loc_table)\n", + "df_loc = df_loc.assign(\n", + " mois = mois,\n", + " annee = annee\n", + ")\n", + "df_loc.to_excel(xls_locataire, sheet_name=\"Location\", index=False)\n", + "#df_loc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2e22a94", + "metadata": {}, + "outputs": [], + "source": [ + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dad54ca3", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/README.md b/README.md new file mode 100644 index 0000000..e69de29 diff --git a/pdf_auralia/__init__.py b/pdf_auralia/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pdf_auralia/extract.py b/pdf_auralia/extract.py new file mode 100644 index 0000000..48710bf --- /dev/null +++ b/pdf_auralia/extract.py @@ -0,0 +1,116 @@ +import pandas as pd +import numpy as np +import pdfplumber +from pathlib import Path +import click + + +def extract_situation_loc(table): + try: + df = pd.DataFrame(table[1:], columns=table[0]) + except IndexError: + print(table) + rows = [] + for i, row in df[df["Locataires"] == "Totaux"].iterrows(): + above_row_loc = df.iloc[i - 1]["Locataires"] + up_row = pd.concat( + [ + row, + parse_above_loc(above_row_loc), + ] + ) + + rows.append(up_row) + df_cleaned = pd.concat(rows, axis=1).T + df_cleaned.drop(["Locataires", "", "Période"], axis=1, inplace=True) + return df_cleaned + + +def parse_above_loc(content): + row = {} + try: + app, loc = content.split("\n") + except ValueError: + row["lot"] = "" + row["type"] = "" + row["locataire"] = content + + else: + app_ = app.split(" ") + row["lot"] = app_[1] + row["type"] = " ".join(app_[2:]) + row["locataire"] = loc + return pd.Series(row) + + +def extract_charge(table): + df = ( + pd.DataFrame(table[1:], columns=table[0]) + .replace("", np.nan) + .dropna(subset=["Débits"]) + ) + drop_index = df[ + df["RECAPITULATIF DES OPERATIONS"].str.contains("TOTAUX", case=False) or + df["RECAPITULATIF DES OPERATIONS"].str.contains("solde", case=False) + ].index + df.drop(drop_index, inplace=True) + return df + + +charge_table_settings = { + "vertical_strategy": "lines", + "horizontal_strategy": "text", +} + + +def extract_from_pdf(pdf, charge_dest, location_dest): + loc_table = [] + for page in pdf.pages: + situation_loc_line = [ + l + for l in page.extract_text().split("\n") + if "SITUATION DES LOCATAIRES" in l + ] + if situation_loc_line: + mois, annee = situation_loc_line[0].split(" ")[-2:] + if loc_table: + loc_table += page.extract_table()[1:] + else: + loc_table = page.extract_table() + + if "HONORAIRES" in page.extract_text(): + df_charge = extract_charge(page.extract_table(charge_table_settings)) + df_charge.to_excel(charge_dest, sheet_name="Charges", index=False) + print(f"{charge_dest} saved") + + df_loc = extract_situation_loc(loc_table) + df_loc = df_loc.assign(mois=mois, annee=annee) + df_loc.to_excel(location_dest, sheet_name="Location", index=False) + print(f"{location_dest} saved") + + +def extract_save(pdf_file): + pdf_file = Path(pdf_file) + xls_charge = f"{pdf_file.stem.replace(' ', '_')}_charge.xlsx" + xls_locataire = f"{pdf_file.stem.replace(' ', '_')}_locataire.xlsx" + + pdf = pdfplumber.open(pdf_file) + extract_from_pdf(pdf, xls_charge, xls_locataire) + +@click.command() +@click.option("--pdf_file", help="Nom du fichier pdf", default="") +@click.option("--folder", help="Tous les fichiers dans folder", default="./") +def pdf2xlsx(pdf_file, folder): + if pdf_file: + extract_save(pdf_file) + else: + p = Path(folder) + pdf_files = [x for x in p.iterdir() if ".pdf" in str(x)] + for pdf_file in pdf_files: + extract_save(pdf_file) + + + + +if __name__ == "__main__": + pdf2xlsx() diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000..2ddd539 --- /dev/null +++ b/poetry.lock @@ -0,0 +1,8 @@ +package = [] + +[metadata] +lock-version = "1.1" +python-versions = "^3.10" +content-hash = "53f2eabc9c26446fbcc00d348c47878e118afc2054778c3c803a0a8028af27d9" + +[metadata.files] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..038da02 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,15 @@ +[tool.poetry] +name = "pdf-auralia" +version = "0.1.0" +description = "" +authors = ["Bertrand Benjamin "] +readme = "README.md" +packages = [{include = "pdf_auralia"}] + +[tool.poetry.dependencies] +python = "^3.10" + + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..15545b8 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,76 @@ +argon2-cffi==21.3.0 +argon2-cffi-bindings==21.2.0 +asttokens==2.0.8 +attrs==22.1.0 +backcall==0.2.0 +beautifulsoup4==4.11.1 +bleach==5.0.1 +cffi==1.15.1 +charset-normalizer==2.1.1 +cryptography==38.0.1 +debugpy==1.6.3 +decorator==5.1.1 +defusedxml==0.7.1 +entrypoints==0.4 +et-xmlfile==1.1.0 +executing==1.1.0 +fastjsonschema==2.16.2 +ipykernel==6.16.0 +ipython==8.5.0 +ipython-genutils==0.2.0 +ipywidgets==8.0.2 +jedi==0.18.1 +Jinja2==3.1.2 +jsonschema==4.16.0 +jupyter==1.0.0 +jupyter-console==6.4.4 +jupyter-core==4.11.1 +jupyter_client==7.3.5 +jupyterlab-pygments==0.2.2 +jupyterlab-widgets==3.0.3 +lxml==4.9.1 +MarkupSafe==2.1.1 +matplotlib-inline==0.1.6 +mistune==2.0.4 +nbclient==0.6.8 +nbconvert==7.0.0 +nbformat==5.6.1 +nest-asyncio==1.5.5 +notebook==6.4.12 +numpy==1.23.3 +openpyxl==3.0.10 +packaging==21.3 +pandas==1.5.0 +pandocfilters==1.5.0 +parso==0.8.3 +pdfminer.six==20220524 +pdfplumber==0.7.4 +pexpect==4.8.0 +pickleshare==0.7.5 +Pillow==9.2.0 +prometheus-client==0.14.1 +prompt-toolkit==3.0.31 +psutil==5.9.2 +ptyprocess==0.7.0 +pure-eval==0.2.2 +pycparser==2.21 +Pygments==2.13.0 +pyparsing==3.0.9 +pyrsistent==0.18.1 +python-dateutil==2.8.2 +pytz==2022.2.1 +pyzmq==24.0.1 +qtconsole==5.3.2 +QtPy==2.2.0 +Send2Trash==1.8.0 +six==1.16.0 +soupsieve==2.3.2.post1 +stack-data==0.5.1 +terminado==0.15.0 +tinycss2==1.1.1 +tornado==6.2 +traitlets==5.4.0 +Wand==0.6.10 +wcwidth==0.2.5 +webencodings==0.5.1 +widgetsnbextension==4.0.3 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29