pdf_auralia/Extract pdf.ipynb

680 lines
28 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "e24ca74b",
"metadata": {},
"source": [
"# Extraction des informations pour Oralia"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "1ac85f0c",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import pdfplumber\n",
"from pathlib import Path"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b246985",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b80265f1",
"metadata": {},
"outputs": [],
"source": [
"pdf_file = Path(\"./pdfs/2022 04 Servient.pdf\")\n",
"pdf = pdfplumber.open(pdf_file)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "a62448d8",
"metadata": {},
"outputs": [],
"source": [
"xls_charge = f\"{pdf_file.stem.replace(' ', '_')}_charge.xlsx\"\n",
"xls_locataire = f\"{pdf_file.stem.replace(' ', '_')}_locataire.xlsx\""
]
},
{
"cell_type": "markdown",
"id": "1f503cf5",
"metadata": {},
"source": [
"## Page 1: Récapitulatif"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ae9eb950",
"metadata": {},
"outputs": [],
"source": [
"p1 = pdf.pages[0]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "32ef66d7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"SITUATION DES LOCATAIRES\" in p1.extract_text()"
]
},
{
"cell_type": "markdown",
"id": "50dd9c09",
"metadata": {},
"source": [
"## Situation des locataires"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e9c0aefd",
"metadata": {},
"outputs": [],
"source": [
"def extract_situation_loc(table):\n",
" df = pd.DataFrame(table[1:], columns=table[0])\n",
" rows = []\n",
" for i, row in df[df[\"Locataires\"]==\"Totaux\"].iterrows():\n",
" above_row_loc = df.iloc[i-1][\"Locataires\"]\n",
" up_row = pd.concat([row, \n",
" parse_above_loc(above_row_loc),\n",
" ])\n",
"\n",
" rows.append(up_row)\n",
" df_cleaned = pd.concat(rows, axis=1).T\n",
" df_cleaned.drop([\"Locataires\", \"\", \"Période\"], axis=1, inplace=True)\n",
" return df_cleaned\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "87e05f50",
"metadata": {},
"outputs": [],
"source": [
"def parse_above_loc(content):\n",
" row = {}\n",
" try:\n",
" app, loc = content.split(\"\\n\")\n",
" except ValueError:\n",
" row[\"lot\"] = \"\"\n",
" row[\"type\"] = \"\"\n",
" row[\"locataire\"] = content\n",
" \n",
" else:\n",
" app_ = app.split(\" \")\n",
" row[\"lot\"] = app_[1]\n",
" row[\"type\"] = \" \".join(app_[2:])\n",
" row[\"locataire\"] = loc\n",
" return pd.Series(row)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "19a1446d",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 8,
"id": "8afb23c7",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Loyers</th>\n",
" <th>Taxes</th>\n",
" <th>Provisions</th>\n",
" <th>Divers</th>\n",
" <th>Total</th>\n",
" <th>Réglés</th>\n",
" <th>Impayés</th>\n",
" <th>lot</th>\n",
" <th>type</th>\n",
" <th>locataire</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>342.80</td>\n",
" <td>0.00</td>\n",
" <td>663.00</td>\n",
" <td>0.00</td>\n",
" <td>1005.80</td>\n",
" <td>1005.80</td>\n",
" <td></td>\n",
" <td>0001</td>\n",
" <td>Loc. Commercial</td>\n",
" <td>EFFUSION</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3473.79</td>\n",
" <td>0.00</td>\n",
" <td>519.00</td>\n",
" <td>96.79</td>\n",
" <td>4089.58</td>\n",
" <td>4089.58</td>\n",
" <td></td>\n",
" <td>0002</td>\n",
" <td>Loc. Commercial</td>\n",
" <td>RAS</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>597.20</td>\n",
" <td>0.00</td>\n",
" <td>31.00</td>\n",
" <td>0.00</td>\n",
" <td>628.60</td>\n",
" <td>628.60</td>\n",
" <td></td>\n",
" <td>0003</td>\n",
" <td>Appartement T1</td>\n",
" <td>KALAI Bernard</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>596.59</td>\n",
" <td>0.00</td>\n",
" <td>31.00</td>\n",
" <td>0.00</td>\n",
" <td>627.59</td>\n",
" <td>627.59</td>\n",
" <td></td>\n",
" <td>0004</td>\n",
" <td>Appartement T2</td>\n",
" <td>PEJAUDIER Adelaide</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>468.85</td>\n",
" <td>0.00</td>\n",
" <td>20.00</td>\n",
" <td>0.00</td>\n",
" <td>981.03</td>\n",
" <td>485.00</td>\n",
" <td>496.03</td>\n",
" <td>0009</td>\n",
" <td>Appartement T1</td>\n",
" <td>MANNA Baptiste</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>745.39</td>\n",
" <td>0.00</td>\n",
" <td>191.00</td>\n",
" <td>0.00</td>\n",
" <td>936.39</td>\n",
" <td>936.39</td>\n",
" <td></td>\n",
" <td>0005</td>\n",
" <td>Loc. Commercial</td>\n",
" <td>ATELIERS RENAISSANCE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>834.55</td>\n",
" <td>0.00</td>\n",
" <td>81.00</td>\n",
" <td>0.00</td>\n",
" <td>915.55</td>\n",
" <td>915.55</td>\n",
" <td></td>\n",
" <td>0006</td>\n",
" <td>Appartement T3</td>\n",
" <td>GUELLIER MURIEL</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>591.69</td>\n",
" <td>0.00</td>\n",
" <td>50.00</td>\n",
" <td>0.00</td>\n",
" <td>641.69</td>\n",
" <td>641.69</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>Lot 0007 Appartement T1\\nDOMINIKIEWICZ\\nMELANIE</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>574.71</td>\n",
" <td>0.00</td>\n",
" <td>28.00</td>\n",
" <td>0.00</td>\n",
" <td>602.71</td>\n",
" <td>602.71</td>\n",
" <td></td>\n",
" <td>0008</td>\n",
" <td>Appartement T1</td>\n",
" <td>BESSON Léa</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1201.10</td>\n",
" <td>0.00</td>\n",
" <td>87.00</td>\n",
" <td>0.00</td>\n",
" <td>1288.10</td>\n",
" <td>1288.10</td>\n",
" <td></td>\n",
" <td>0010</td>\n",
" <td>Appartement T3</td>\n",
" <td>FILIPPI Bérengère</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>500.46</td>\n",
" <td>0.00</td>\n",
" <td>28.00</td>\n",
" <td>0.00</td>\n",
" <td>528.46</td>\n",
" <td>528.46</td>\n",
" <td></td>\n",
" <td>0011</td>\n",
" <td>Appartement T1</td>\n",
" <td>LOINE Anaïs</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Loyers Taxes Provisions Divers Total Réglés Impayés lot \\\n",
"0 342.80 0.00 663.00 0.00 1005.80 1005.80 0001 \n",
"1 3473.79 0.00 519.00 96.79 4089.58 4089.58 0002 \n",
"2 597.20 0.00 31.00 0.00 628.60 628.60 0003 \n",
"3 596.59 0.00 31.00 0.00 627.59 627.59 0004 \n",
"4 468.85 0.00 20.00 0.00 981.03 485.00 496.03 0009 \n",
"5 745.39 0.00 191.00 0.00 936.39 936.39 0005 \n",
"6 834.55 0.00 81.00 0.00 915.55 915.55 0006 \n",
"7 591.69 0.00 50.00 0.00 641.69 641.69 \n",
"8 574.71 0.00 28.00 0.00 602.71 602.71 0008 \n",
"9 1201.10 0.00 87.00 0.00 1288.10 1288.10 0010 \n",
"10 500.46 0.00 28.00 0.00 528.46 528.46 0011 \n",
"\n",
" type locataire \n",
"0 Loc. Commercial EFFUSION \n",
"1 Loc. Commercial RAS \n",
"2 Appartement T1 KALAI Bernard \n",
"3 Appartement T2 PEJAUDIER Adelaide \n",
"4 Appartement T1 MANNA Baptiste \n",
"5 Loc. Commercial ATELIERS RENAISSANCE \n",
"6 Appartement T3 GUELLIER MURIEL \n",
"7 Lot 0007 Appartement T1\\nDOMINIKIEWICZ\\nMELANIE \n",
"8 Appartement T1 BESSON Léa \n",
"9 Appartement T3 FILIPPI Bérengère \n",
"10 Appartement T1 LOINE Anaïs "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"p2 = pdf.pages[1]\n",
"extract_situation_loc(p2.extract_table())"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "0e0ddca7",
"metadata": {},
"outputs": [],
"source": [
"charge_table_settings = {\n",
" \"vertical_strategy\": \"lines\",\n",
" \"horizontal_strategy\": \"text\",\n",
"}\n",
"def extract_charge(table):\n",
" df = pd.DataFrame(table[1:], columns=table[0]).replace(\"\", np.nan).dropna(subset=[\"Débits\"])\n",
" drop_index = df[df[\"RECAPITULATIF DES OPERATIONS\"].str.contains(\"TOTAUX\", case=False)].index\n",
" df.drop(drop_index, inplace=True)\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "b915b220",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>RECAPITULATIF DES OPERATIONS</th>\n",
" <th>Débits</th>\n",
" <th>Crédits</th>\n",
" <th>Dont T.V.A.</th>\n",
" <th>Locatif</th>\n",
" <th>Déductible</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>DIDIER NETTOYAGE</td>\n",
" <td>PC - ENTRETIEN IMMEUBLE</td>\n",
" <td>708.58</td>\n",
" <td>NaN</td>\n",
" <td>118.10</td>\n",
" <td>708.58</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>TOTAL DIRECT ENERGIE</td>\n",
" <td>PC TOTAL DIRECT ENERGIE</td>\n",
" <td>65.70</td>\n",
" <td>NaN</td>\n",
" <td>7.03</td>\n",
" <td>65.70</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>EDF</td>\n",
" <td>PC EDF DU 17.04.2022</td>\n",
" <td>89.56</td>\n",
" <td>NaN</td>\n",
" <td>10.22</td>\n",
" <td>89.56</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>PICARD SERVICE</td>\n",
" <td>Facture du 11/04/2022</td>\n",
" <td>66.76</td>\n",
" <td>NaN</td>\n",
" <td>6.07</td>\n",
" <td>66.76</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>V2C MAINTENANCE</td>\n",
" <td>6 - remplacement circulateur chudière</td>\n",
" <td>447.70</td>\n",
" <td>NaN</td>\n",
" <td>40.70</td>\n",
" <td>NaN</td>\n",
" <td>447.70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>IMI GERANCE</td>\n",
" <td>TAVARES NORTE Dylan</td>\n",
" <td>93.00</td>\n",
" <td>NaN</td>\n",
" <td>15.50</td>\n",
" <td>NaN</td>\n",
" <td>93.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>IMI GERANCE</td>\n",
" <td>TAVARES NORTE Dylan</td>\n",
" <td>173.58</td>\n",
" <td>NaN</td>\n",
" <td>28.93</td>\n",
" <td>NaN</td>\n",
" <td>173.58</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>IMI GERANCE</td>\n",
" <td>TAVARES NORTE Dylan</td>\n",
" <td>798.72</td>\n",
" <td>NaN</td>\n",
" <td>133.12</td>\n",
" <td>NaN</td>\n",
" <td>798.72</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>NaN</td>\n",
" <td>Honoraires H.T.</td>\n",
" <td>979.20</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>979.20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>NaN</td>\n",
" <td>TVA/Honoraires ( 20.00 % )</td>\n",
" <td>195.84</td>\n",
" <td>NaN</td>\n",
" <td>195.84</td>\n",
" <td>NaN</td>\n",
" <td>195.84</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" RECAPITULATIF DES OPERATIONS Débits \\\n",
"4 DIDIER NETTOYAGE PC - ENTRETIEN IMMEUBLE 708.58 \n",
"6 TOTAL DIRECT ENERGIE PC TOTAL DIRECT ENERGIE 65.70 \n",
"7 EDF PC EDF DU 17.04.2022 89.56 \n",
"9 PICARD SERVICE Facture du 11/04/2022 66.76 \n",
"15 V2C MAINTENANCE 6 - remplacement circulateur chudière 447.70 \n",
"20 IMI GERANCE TAVARES NORTE Dylan 93.00 \n",
"23 IMI GERANCE TAVARES NORTE Dylan 173.58 \n",
"27 IMI GERANCE TAVARES NORTE Dylan 798.72 \n",
"29 NaN Honoraires H.T. 979.20 \n",
"30 NaN TVA/Honoraires ( 20.00 % ) 195.84 \n",
"\n",
" Crédits Dont T.V.A. Locatif Déductible \n",
"4 NaN 118.10 708.58 NaN \n",
"6 NaN 7.03 65.70 NaN \n",
"7 NaN 10.22 89.56 NaN \n",
"9 NaN 6.07 66.76 NaN \n",
"15 NaN 40.70 NaN 447.70 \n",
"20 NaN 15.50 NaN 93.00 \n",
"23 NaN 28.93 NaN 173.58 \n",
"27 NaN 133.12 NaN 798.72 \n",
"29 NaN NaN NaN 979.20 \n",
"30 NaN 195.84 NaN 195.84 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"p4 = pdf.pages[3]\n",
"extract_charge(p4.extract_table(charge_table_settings))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "c7b071fa",
"metadata": {},
"outputs": [],
"source": [
"# im = p4.to_image()\n",
"# im.debug_tablefinder(charge_table_settings)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "ebe2881a",
"metadata": {
"scrolled": false
},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'openpyxl'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn [12], line 15\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mHONORAIRES\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m page\u001b[38;5;241m.\u001b[39mextract_text():\n\u001b[1;32m 13\u001b[0m df_charge \u001b[38;5;241m=\u001b[39m extract_charge(page\u001b[38;5;241m.\u001b[39mextract_table(charge_table_settings))\n\u001b[0;32m---> 15\u001b[0m df_charge\u001b[38;5;241m.\u001b[39mto_excel(xls_charge, sheet_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCharges\u001b[39m\u001b[38;5;124m\"\u001b[39m, index\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 18\u001b[0m df_loc \u001b[38;5;241m=\u001b[39m extract_situation_loc(loc_table)\n\u001b[1;32m 19\u001b[0m df_loc \u001b[38;5;241m=\u001b[39m df_loc\u001b[38;5;241m.\u001b[39massign(\n\u001b[1;32m 20\u001b[0m mois \u001b[38;5;241m=\u001b[39m mois,\n\u001b[1;32m 21\u001b[0m annee \u001b[38;5;241m=\u001b[39m annee\n\u001b[1;32m 22\u001b[0m )\n",
"File \u001b[0;32m~/.venv/plesna/lib/python3.10/site-packages/pandas/util/_decorators.py:211\u001b[0m, in \u001b[0;36mdeprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 210\u001b[0m kwargs[new_arg_name] \u001b[38;5;241m=\u001b[39m new_arg_value\n\u001b[0;32m--> 211\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/.venv/plesna/lib/python3.10/site-packages/pandas/util/_decorators.py:211\u001b[0m, in \u001b[0;36mdeprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 210\u001b[0m kwargs[new_arg_name] \u001b[38;5;241m=\u001b[39m new_arg_value\n\u001b[0;32m--> 211\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/.venv/plesna/lib/python3.10/site-packages/pandas/core/generic.py:2373\u001b[0m, in \u001b[0;36mNDFrame.to_excel\u001b[0;34m(self, excel_writer, sheet_name, na_rep, float_format, columns, header, index, index_label, startrow, startcol, engine, merge_cells, encoding, inf_rep, verbose, freeze_panes, storage_options)\u001b[0m\n\u001b[1;32m 2360\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mio\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mformats\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mexcel\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ExcelFormatter\n\u001b[1;32m 2362\u001b[0m formatter \u001b[38;5;241m=\u001b[39m ExcelFormatter(\n\u001b[1;32m 2363\u001b[0m df,\n\u001b[1;32m 2364\u001b[0m na_rep\u001b[38;5;241m=\u001b[39mna_rep,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 2371\u001b[0m inf_rep\u001b[38;5;241m=\u001b[39minf_rep,\n\u001b[1;32m 2372\u001b[0m )\n\u001b[0;32m-> 2373\u001b[0m \u001b[43mformatter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwrite\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2374\u001b[0m \u001b[43m \u001b[49m\u001b[43mexcel_writer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2375\u001b[0m \u001b[43m \u001b[49m\u001b[43msheet_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msheet_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2376\u001b[0m \u001b[43m \u001b[49m\u001b[43mstartrow\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstartrow\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2377\u001b[0m \u001b[43m \u001b[49m\u001b[43mstartcol\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstartcol\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2378\u001b[0m \u001b[43m \u001b[49m\u001b[43mfreeze_panes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfreeze_panes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2379\u001b[0m \u001b[43m \u001b[49m\u001b[43mengine\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2380\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2381\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/.venv/plesna/lib/python3.10/site-packages/pandas/io/formats/excel.py:911\u001b[0m, in \u001b[0;36mExcelFormatter.write\u001b[0;34m(self, writer, sheet_name, startrow, startcol, freeze_panes, engine, storage_options)\u001b[0m\n\u001b[1;32m 907\u001b[0m need_save \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 908\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 909\u001b[0m \u001b[38;5;66;03m# error: Cannot instantiate abstract class 'ExcelWriter' with abstract\u001b[39;00m\n\u001b[1;32m 910\u001b[0m \u001b[38;5;66;03m# attributes 'engine', 'save', 'supported_extensions' and 'write_cells'\u001b[39;00m\n\u001b[0;32m--> 911\u001b[0m writer \u001b[38;5;241m=\u001b[39m \u001b[43mExcelWriter\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore[abstract]\u001b[39;49;00m\n\u001b[1;32m 912\u001b[0m \u001b[43m \u001b[49m\u001b[43mwriter\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mengine\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\n\u001b[1;32m 913\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 914\u001b[0m need_save \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 916\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n",
"File \u001b[0;32m~/.venv/plesna/lib/python3.10/site-packages/pandas/io/excel/_openpyxl.py:56\u001b[0m, in \u001b[0;36mOpenpyxlWriter.__init__\u001b[0;34m(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs, **kwargs)\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\n\u001b[1;32m 44\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 45\u001b[0m path: FilePath \u001b[38;5;241m|\u001b[39m WriteExcelBuffer \u001b[38;5;241m|\u001b[39m ExcelWriter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 54\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 55\u001b[0m \u001b[38;5;66;03m# Use the openpyxl module as the Excel writer.\u001b[39;00m\n\u001b[0;32m---> 56\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mopenpyxl\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mworkbook\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Workbook\n\u001b[1;32m 58\u001b[0m engine_kwargs \u001b[38;5;241m=\u001b[39m combine_kwargs(engine_kwargs, kwargs)\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m(\n\u001b[1;32m 61\u001b[0m path,\n\u001b[1;32m 62\u001b[0m mode\u001b[38;5;241m=\u001b[39mmode,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 65\u001b[0m engine_kwargs\u001b[38;5;241m=\u001b[39mengine_kwargs,\n\u001b[1;32m 66\u001b[0m )\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'openpyxl'"
]
}
],
"source": [
"frames = []\n",
"loc_table = []\n",
"for page in pdf.pages:\n",
" situation_loc_line = [l for l in page.extract_text().split(\"\\n\") if \"SITUATION DES LOCATAIRES CRG\" in l]\n",
" if situation_loc_line:\n",
" mois, annee = situation_loc_line[0].split(\" \")[-2:]\n",
" if loc_table:\n",
" loc_table += page.extract_table()[1:]\n",
" else:\n",
" loc_table = page.extract_table()\n",
"\n",
" if \"HONORAIRES\" in page.extract_text():\n",
" df_charge = extract_charge(page.extract_table(charge_table_settings))\n",
"\n",
" df_charge.to_excel(xls_charge, sheet_name=\"Charges\", index=False)\n",
"\n",
"\n",
"df_loc = extract_situation_loc(loc_table)\n",
"df_loc = df_loc.assign(\n",
" mois = mois,\n",
" annee = annee\n",
")\n",
"df_loc.to_excel(xls_locataire, sheet_name=\"Location\", index=False)\n",
"#df_loc"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f2e22a94",
"metadata": {},
"outputs": [],
"source": [
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dad54ca3",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}