Feat: init repository
This commit is contained in:
commit
546f4d1949
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
pdfs/
|
679
Extract pdf.ipynb
Normal file
679
Extract pdf.ipynb
Normal file
@ -0,0 +1,679 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "e24ca74b",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Extraction des informations pour Oralia"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "1ac85f0c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pdfplumber\n",
|
||||||
|
"from pathlib import Path"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "6b246985",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "b80265f1",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"pdf_file = Path(\"./pdfs/2022 04 Servient.pdf\")\n",
|
||||||
|
"pdf = pdfplumber.open(pdf_file)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "a62448d8",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"xls_charge = f\"{pdf_file.stem.replace(' ', '_')}_charge.xlsx\"\n",
|
||||||
|
"xls_locataire = f\"{pdf_file.stem.replace(' ', '_')}_locataire.xlsx\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "1f503cf5",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Page 1: Récapitulatif"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "ae9eb950",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"p1 = pdf.pages[0]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "32ef66d7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"False"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"\"SITUATION DES LOCATAIRES\" in p1.extract_text()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "50dd9c09",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Situation des locataires"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "e9c0aefd",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def extract_situation_loc(table):\n",
|
||||||
|
" df = pd.DataFrame(table[1:], columns=table[0])\n",
|
||||||
|
" rows = []\n",
|
||||||
|
" for i, row in df[df[\"Locataires\"]==\"Totaux\"].iterrows():\n",
|
||||||
|
" above_row_loc = df.iloc[i-1][\"Locataires\"]\n",
|
||||||
|
" up_row = pd.concat([row, \n",
|
||||||
|
" parse_above_loc(above_row_loc),\n",
|
||||||
|
" ])\n",
|
||||||
|
"\n",
|
||||||
|
" rows.append(up_row)\n",
|
||||||
|
" df_cleaned = pd.concat(rows, axis=1).T\n",
|
||||||
|
" df_cleaned.drop([\"Locataires\", \"\", \"Période\"], axis=1, inplace=True)\n",
|
||||||
|
" return df_cleaned\n",
|
||||||
|
" "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "87e05f50",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def parse_above_loc(content):\n",
|
||||||
|
" row = {}\n",
|
||||||
|
" try:\n",
|
||||||
|
" app, loc = content.split(\"\\n\")\n",
|
||||||
|
" except ValueError:\n",
|
||||||
|
" row[\"lot\"] = \"\"\n",
|
||||||
|
" row[\"type\"] = \"\"\n",
|
||||||
|
" row[\"locataire\"] = content\n",
|
||||||
|
" \n",
|
||||||
|
" else:\n",
|
||||||
|
" app_ = app.split(\" \")\n",
|
||||||
|
" row[\"lot\"] = app_[1]\n",
|
||||||
|
" row[\"type\"] = \" \".join(app_[2:])\n",
|
||||||
|
" row[\"locataire\"] = loc\n",
|
||||||
|
" return pd.Series(row)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "19a1446d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "8afb23c7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>Loyers</th>\n",
|
||||||
|
" <th>Taxes</th>\n",
|
||||||
|
" <th>Provisions</th>\n",
|
||||||
|
" <th>Divers</th>\n",
|
||||||
|
" <th>Total</th>\n",
|
||||||
|
" <th>Réglés</th>\n",
|
||||||
|
" <th>Impayés</th>\n",
|
||||||
|
" <th>lot</th>\n",
|
||||||
|
" <th>type</th>\n",
|
||||||
|
" <th>locataire</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>342.80</td>\n",
|
||||||
|
" <td>0.00</td>\n",
|
||||||
|
" <td>663.00</td>\n",
|
||||||
|
" <td>0.00</td>\n",
|
||||||
|
" <td>1005.80</td>\n",
|
||||||
|
" <td>1005.80</td>\n",
|
||||||
|
" <td></td>\n",
|
||||||
|
" <td>0001</td>\n",
|
||||||
|
" <td>Loc. Commercial</td>\n",
|
||||||
|
" <td>EFFUSION</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>3473.79</td>\n",
|
||||||
|
" <td>0.00</td>\n",
|
||||||
|
" <td>519.00</td>\n",
|
||||||
|
" <td>96.79</td>\n",
|
||||||
|
" <td>4089.58</td>\n",
|
||||||
|
" <td>4089.58</td>\n",
|
||||||
|
" <td></td>\n",
|
||||||
|
" <td>0002</td>\n",
|
||||||
|
" <td>Loc. Commercial</td>\n",
|
||||||
|
" <td>RAS</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>597.20</td>\n",
|
||||||
|
" <td>0.00</td>\n",
|
||||||
|
" <td>31.00</td>\n",
|
||||||
|
" <td>0.00</td>\n",
|
||||||
|
" <td>628.60</td>\n",
|
||||||
|
" <td>628.60</td>\n",
|
||||||
|
" <td></td>\n",
|
||||||
|
" <td>0003</td>\n",
|
||||||
|
" <td>Appartement T1</td>\n",
|
||||||
|
" <td>KALAI Bernard</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>596.59</td>\n",
|
||||||
|
" <td>0.00</td>\n",
|
||||||
|
" <td>31.00</td>\n",
|
||||||
|
" <td>0.00</td>\n",
|
||||||
|
" <td>627.59</td>\n",
|
||||||
|
" <td>627.59</td>\n",
|
||||||
|
" <td></td>\n",
|
||||||
|
" <td>0004</td>\n",
|
||||||
|
" <td>Appartement T2</td>\n",
|
||||||
|
" <td>PEJAUDIER Adelaide</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>468.85</td>\n",
|
||||||
|
" <td>0.00</td>\n",
|
||||||
|
" <td>20.00</td>\n",
|
||||||
|
" <td>0.00</td>\n",
|
||||||
|
" <td>981.03</td>\n",
|
||||||
|
" <td>485.00</td>\n",
|
||||||
|
" <td>496.03</td>\n",
|
||||||
|
" <td>0009</td>\n",
|
||||||
|
" <td>Appartement T1</td>\n",
|
||||||
|
" <td>MANNA Baptiste</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>5</th>\n",
|
||||||
|
" <td>745.39</td>\n",
|
||||||
|
" <td>0.00</td>\n",
|
||||||
|
" <td>191.00</td>\n",
|
||||||
|
" <td>0.00</td>\n",
|
||||||
|
" <td>936.39</td>\n",
|
||||||
|
" <td>936.39</td>\n",
|
||||||
|
" <td></td>\n",
|
||||||
|
" <td>0005</td>\n",
|
||||||
|
" <td>Loc. Commercial</td>\n",
|
||||||
|
" <td>ATELIERS RENAISSANCE</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>6</th>\n",
|
||||||
|
" <td>834.55</td>\n",
|
||||||
|
" <td>0.00</td>\n",
|
||||||
|
" <td>81.00</td>\n",
|
||||||
|
" <td>0.00</td>\n",
|
||||||
|
" <td>915.55</td>\n",
|
||||||
|
" <td>915.55</td>\n",
|
||||||
|
" <td></td>\n",
|
||||||
|
" <td>0006</td>\n",
|
||||||
|
" <td>Appartement T3</td>\n",
|
||||||
|
" <td>GUELLIER MURIEL</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>7</th>\n",
|
||||||
|
" <td>591.69</td>\n",
|
||||||
|
" <td>0.00</td>\n",
|
||||||
|
" <td>50.00</td>\n",
|
||||||
|
" <td>0.00</td>\n",
|
||||||
|
" <td>641.69</td>\n",
|
||||||
|
" <td>641.69</td>\n",
|
||||||
|
" <td></td>\n",
|
||||||
|
" <td></td>\n",
|
||||||
|
" <td></td>\n",
|
||||||
|
" <td>Lot 0007 Appartement T1\\nDOMINIKIEWICZ\\nMELANIE</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>8</th>\n",
|
||||||
|
" <td>574.71</td>\n",
|
||||||
|
" <td>0.00</td>\n",
|
||||||
|
" <td>28.00</td>\n",
|
||||||
|
" <td>0.00</td>\n",
|
||||||
|
" <td>602.71</td>\n",
|
||||||
|
" <td>602.71</td>\n",
|
||||||
|
" <td></td>\n",
|
||||||
|
" <td>0008</td>\n",
|
||||||
|
" <td>Appartement T1</td>\n",
|
||||||
|
" <td>BESSON Léa</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>9</th>\n",
|
||||||
|
" <td>1201.10</td>\n",
|
||||||
|
" <td>0.00</td>\n",
|
||||||
|
" <td>87.00</td>\n",
|
||||||
|
" <td>0.00</td>\n",
|
||||||
|
" <td>1288.10</td>\n",
|
||||||
|
" <td>1288.10</td>\n",
|
||||||
|
" <td></td>\n",
|
||||||
|
" <td>0010</td>\n",
|
||||||
|
" <td>Appartement T3</td>\n",
|
||||||
|
" <td>FILIPPI Bérengère</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>10</th>\n",
|
||||||
|
" <td>500.46</td>\n",
|
||||||
|
" <td>0.00</td>\n",
|
||||||
|
" <td>28.00</td>\n",
|
||||||
|
" <td>0.00</td>\n",
|
||||||
|
" <td>528.46</td>\n",
|
||||||
|
" <td>528.46</td>\n",
|
||||||
|
" <td></td>\n",
|
||||||
|
" <td>0011</td>\n",
|
||||||
|
" <td>Appartement T1</td>\n",
|
||||||
|
" <td>LOINE Anaïs</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" Loyers Taxes Provisions Divers Total Réglés Impayés lot \\\n",
|
||||||
|
"0 342.80 0.00 663.00 0.00 1005.80 1005.80 0001 \n",
|
||||||
|
"1 3473.79 0.00 519.00 96.79 4089.58 4089.58 0002 \n",
|
||||||
|
"2 597.20 0.00 31.00 0.00 628.60 628.60 0003 \n",
|
||||||
|
"3 596.59 0.00 31.00 0.00 627.59 627.59 0004 \n",
|
||||||
|
"4 468.85 0.00 20.00 0.00 981.03 485.00 496.03 0009 \n",
|
||||||
|
"5 745.39 0.00 191.00 0.00 936.39 936.39 0005 \n",
|
||||||
|
"6 834.55 0.00 81.00 0.00 915.55 915.55 0006 \n",
|
||||||
|
"7 591.69 0.00 50.00 0.00 641.69 641.69 \n",
|
||||||
|
"8 574.71 0.00 28.00 0.00 602.71 602.71 0008 \n",
|
||||||
|
"9 1201.10 0.00 87.00 0.00 1288.10 1288.10 0010 \n",
|
||||||
|
"10 500.46 0.00 28.00 0.00 528.46 528.46 0011 \n",
|
||||||
|
"\n",
|
||||||
|
" type locataire \n",
|
||||||
|
"0 Loc. Commercial EFFUSION \n",
|
||||||
|
"1 Loc. Commercial RAS \n",
|
||||||
|
"2 Appartement T1 KALAI Bernard \n",
|
||||||
|
"3 Appartement T2 PEJAUDIER Adelaide \n",
|
||||||
|
"4 Appartement T1 MANNA Baptiste \n",
|
||||||
|
"5 Loc. Commercial ATELIERS RENAISSANCE \n",
|
||||||
|
"6 Appartement T3 GUELLIER MURIEL \n",
|
||||||
|
"7 Lot 0007 Appartement T1\\nDOMINIKIEWICZ\\nMELANIE \n",
|
||||||
|
"8 Appartement T1 BESSON Léa \n",
|
||||||
|
"9 Appartement T3 FILIPPI Bérengère \n",
|
||||||
|
"10 Appartement T1 LOINE Anaïs "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"p2 = pdf.pages[1]\n",
|
||||||
|
"extract_situation_loc(p2.extract_table())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "0e0ddca7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"charge_table_settings = {\n",
|
||||||
|
" \"vertical_strategy\": \"lines\",\n",
|
||||||
|
" \"horizontal_strategy\": \"text\",\n",
|
||||||
|
"}\n",
|
||||||
|
"def extract_charge(table):\n",
|
||||||
|
" df = pd.DataFrame(table[1:], columns=table[0]).replace(\"\", np.nan).dropna(subset=[\"Débits\"])\n",
|
||||||
|
" drop_index = df[df[\"RECAPITULATIF DES OPERATIONS\"].str.contains(\"TOTAUX\", case=False)].index\n",
|
||||||
|
" df.drop(drop_index, inplace=True)\n",
|
||||||
|
" return df"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"id": "b915b220",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>RECAPITULATIF DES OPERATIONS</th>\n",
|
||||||
|
" <th>Débits</th>\n",
|
||||||
|
" <th>Crédits</th>\n",
|
||||||
|
" <th>Dont T.V.A.</th>\n",
|
||||||
|
" <th>Locatif</th>\n",
|
||||||
|
" <th>Déductible</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>DIDIER NETTOYAGE</td>\n",
|
||||||
|
" <td>PC - ENTRETIEN IMMEUBLE</td>\n",
|
||||||
|
" <td>708.58</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>118.10</td>\n",
|
||||||
|
" <td>708.58</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>6</th>\n",
|
||||||
|
" <td>TOTAL DIRECT ENERGIE</td>\n",
|
||||||
|
" <td>PC TOTAL DIRECT ENERGIE</td>\n",
|
||||||
|
" <td>65.70</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>7.03</td>\n",
|
||||||
|
" <td>65.70</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>7</th>\n",
|
||||||
|
" <td>EDF</td>\n",
|
||||||
|
" <td>PC EDF DU 17.04.2022</td>\n",
|
||||||
|
" <td>89.56</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>10.22</td>\n",
|
||||||
|
" <td>89.56</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>9</th>\n",
|
||||||
|
" <td>PICARD SERVICE</td>\n",
|
||||||
|
" <td>Facture du 11/04/2022</td>\n",
|
||||||
|
" <td>66.76</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>6.07</td>\n",
|
||||||
|
" <td>66.76</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>15</th>\n",
|
||||||
|
" <td>V2C MAINTENANCE</td>\n",
|
||||||
|
" <td>6 - remplacement circulateur chudière</td>\n",
|
||||||
|
" <td>447.70</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>40.70</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>447.70</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>20</th>\n",
|
||||||
|
" <td>IMI GERANCE</td>\n",
|
||||||
|
" <td>TAVARES NORTE Dylan</td>\n",
|
||||||
|
" <td>93.00</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>15.50</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>93.00</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>23</th>\n",
|
||||||
|
" <td>IMI GERANCE</td>\n",
|
||||||
|
" <td>TAVARES NORTE Dylan</td>\n",
|
||||||
|
" <td>173.58</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>28.93</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>173.58</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>27</th>\n",
|
||||||
|
" <td>IMI GERANCE</td>\n",
|
||||||
|
" <td>TAVARES NORTE Dylan</td>\n",
|
||||||
|
" <td>798.72</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>133.12</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>798.72</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>29</th>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>Honoraires H.T.</td>\n",
|
||||||
|
" <td>979.20</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>979.20</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>30</th>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>TVA/Honoraires ( 20.00 % )</td>\n",
|
||||||
|
" <td>195.84</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>195.84</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>195.84</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" RECAPITULATIF DES OPERATIONS Débits \\\n",
|
||||||
|
"4 DIDIER NETTOYAGE PC - ENTRETIEN IMMEUBLE 708.58 \n",
|
||||||
|
"6 TOTAL DIRECT ENERGIE PC TOTAL DIRECT ENERGIE 65.70 \n",
|
||||||
|
"7 EDF PC EDF DU 17.04.2022 89.56 \n",
|
||||||
|
"9 PICARD SERVICE Facture du 11/04/2022 66.76 \n",
|
||||||
|
"15 V2C MAINTENANCE 6 - remplacement circulateur chudière 447.70 \n",
|
||||||
|
"20 IMI GERANCE TAVARES NORTE Dylan 93.00 \n",
|
||||||
|
"23 IMI GERANCE TAVARES NORTE Dylan 173.58 \n",
|
||||||
|
"27 IMI GERANCE TAVARES NORTE Dylan 798.72 \n",
|
||||||
|
"29 NaN Honoraires H.T. 979.20 \n",
|
||||||
|
"30 NaN TVA/Honoraires ( 20.00 % ) 195.84 \n",
|
||||||
|
"\n",
|
||||||
|
" Crédits Dont T.V.A. Locatif Déductible \n",
|
||||||
|
"4 NaN 118.10 708.58 NaN \n",
|
||||||
|
"6 NaN 7.03 65.70 NaN \n",
|
||||||
|
"7 NaN 10.22 89.56 NaN \n",
|
||||||
|
"9 NaN 6.07 66.76 NaN \n",
|
||||||
|
"15 NaN 40.70 NaN 447.70 \n",
|
||||||
|
"20 NaN 15.50 NaN 93.00 \n",
|
||||||
|
"23 NaN 28.93 NaN 173.58 \n",
|
||||||
|
"27 NaN 133.12 NaN 798.72 \n",
|
||||||
|
"29 NaN NaN NaN 979.20 \n",
|
||||||
|
"30 NaN 195.84 NaN 195.84 "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"p4 = pdf.pages[3]\n",
|
||||||
|
"extract_charge(p4.extract_table(charge_table_settings))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"id": "c7b071fa",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# im = p4.to_image()\n",
|
||||||
|
"# im.debug_tablefinder(charge_table_settings)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"id": "ebe2881a",
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": false
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "ModuleNotFoundError",
|
||||||
|
"evalue": "No module named 'openpyxl'",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||||
|
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
|
||||||
|
"Cell \u001b[0;32mIn [12], line 15\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mHONORAIRES\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m page\u001b[38;5;241m.\u001b[39mextract_text():\n\u001b[1;32m 13\u001b[0m df_charge \u001b[38;5;241m=\u001b[39m extract_charge(page\u001b[38;5;241m.\u001b[39mextract_table(charge_table_settings))\n\u001b[0;32m---> 15\u001b[0m df_charge\u001b[38;5;241m.\u001b[39mto_excel(xls_charge, sheet_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCharges\u001b[39m\u001b[38;5;124m\"\u001b[39m, index\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 18\u001b[0m df_loc \u001b[38;5;241m=\u001b[39m extract_situation_loc(loc_table)\n\u001b[1;32m 19\u001b[0m df_loc \u001b[38;5;241m=\u001b[39m df_loc\u001b[38;5;241m.\u001b[39massign(\n\u001b[1;32m 20\u001b[0m mois \u001b[38;5;241m=\u001b[39m mois,\n\u001b[1;32m 21\u001b[0m annee \u001b[38;5;241m=\u001b[39m annee\n\u001b[1;32m 22\u001b[0m )\n",
|
||||||
|
"File \u001b[0;32m~/.venv/plesna/lib/python3.10/site-packages/pandas/util/_decorators.py:211\u001b[0m, in \u001b[0;36mdeprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 210\u001b[0m kwargs[new_arg_name] \u001b[38;5;241m=\u001b[39m new_arg_value\n\u001b[0;32m--> 211\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||||
|
"File \u001b[0;32m~/.venv/plesna/lib/python3.10/site-packages/pandas/util/_decorators.py:211\u001b[0m, in \u001b[0;36mdeprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 210\u001b[0m kwargs[new_arg_name] \u001b[38;5;241m=\u001b[39m new_arg_value\n\u001b[0;32m--> 211\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||||
|
"File \u001b[0;32m~/.venv/plesna/lib/python3.10/site-packages/pandas/core/generic.py:2373\u001b[0m, in \u001b[0;36mNDFrame.to_excel\u001b[0;34m(self, excel_writer, sheet_name, na_rep, float_format, columns, header, index, index_label, startrow, startcol, engine, merge_cells, encoding, inf_rep, verbose, freeze_panes, storage_options)\u001b[0m\n\u001b[1;32m 2360\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mio\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mformats\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mexcel\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ExcelFormatter\n\u001b[1;32m 2362\u001b[0m formatter \u001b[38;5;241m=\u001b[39m ExcelFormatter(\n\u001b[1;32m 2363\u001b[0m df,\n\u001b[1;32m 2364\u001b[0m na_rep\u001b[38;5;241m=\u001b[39mna_rep,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 2371\u001b[0m inf_rep\u001b[38;5;241m=\u001b[39minf_rep,\n\u001b[1;32m 2372\u001b[0m )\n\u001b[0;32m-> 2373\u001b[0m \u001b[43mformatter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwrite\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2374\u001b[0m \u001b[43m \u001b[49m\u001b[43mexcel_writer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2375\u001b[0m \u001b[43m \u001b[49m\u001b[43msheet_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msheet_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2376\u001b[0m \u001b[43m \u001b[49m\u001b[43mstartrow\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstartrow\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2377\u001b[0m \u001b[43m \u001b[49m\u001b[43mstartcol\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstartcol\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2378\u001b[0m \u001b[43m \u001b[49m\u001b[43mfreeze_panes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfreeze_panes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2379\u001b[0m \u001b[43m \u001b[49m\u001b[43mengine\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2380\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2381\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||||
|
"File \u001b[0;32m~/.venv/plesna/lib/python3.10/site-packages/pandas/io/formats/excel.py:911\u001b[0m, in \u001b[0;36mExcelFormatter.write\u001b[0;34m(self, writer, sheet_name, startrow, startcol, freeze_panes, engine, storage_options)\u001b[0m\n\u001b[1;32m 907\u001b[0m need_save \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 908\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 909\u001b[0m \u001b[38;5;66;03m# error: Cannot instantiate abstract class 'ExcelWriter' with abstract\u001b[39;00m\n\u001b[1;32m 910\u001b[0m \u001b[38;5;66;03m# attributes 'engine', 'save', 'supported_extensions' and 'write_cells'\u001b[39;00m\n\u001b[0;32m--> 911\u001b[0m writer \u001b[38;5;241m=\u001b[39m \u001b[43mExcelWriter\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore[abstract]\u001b[39;49;00m\n\u001b[1;32m 912\u001b[0m \u001b[43m \u001b[49m\u001b[43mwriter\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mengine\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\n\u001b[1;32m 913\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 914\u001b[0m need_save \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 916\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n",
|
||||||
|
"File \u001b[0;32m~/.venv/plesna/lib/python3.10/site-packages/pandas/io/excel/_openpyxl.py:56\u001b[0m, in \u001b[0;36mOpenpyxlWriter.__init__\u001b[0;34m(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs, **kwargs)\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\n\u001b[1;32m 44\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 45\u001b[0m path: FilePath \u001b[38;5;241m|\u001b[39m WriteExcelBuffer \u001b[38;5;241m|\u001b[39m ExcelWriter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 54\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 55\u001b[0m \u001b[38;5;66;03m# Use the openpyxl module as the Excel writer.\u001b[39;00m\n\u001b[0;32m---> 56\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mopenpyxl\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mworkbook\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Workbook\n\u001b[1;32m 58\u001b[0m engine_kwargs \u001b[38;5;241m=\u001b[39m combine_kwargs(engine_kwargs, kwargs)\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__init__\u001b[39m(\n\u001b[1;32m 61\u001b[0m path,\n\u001b[1;32m 62\u001b[0m mode\u001b[38;5;241m=\u001b[39mmode,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 65\u001b[0m engine_kwargs\u001b[38;5;241m=\u001b[39mengine_kwargs,\n\u001b[1;32m 66\u001b[0m )\n",
|
||||||
|
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'openpyxl'"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"frames = []\n",
|
||||||
|
"loc_table = []\n",
|
||||||
|
"for page in pdf.pages:\n",
|
||||||
|
" situation_loc_line = [l for l in page.extract_text().split(\"\\n\") if \"SITUATION DES LOCATAIRES CRG\" in l]\n",
|
||||||
|
" if situation_loc_line:\n",
|
||||||
|
" mois, annee = situation_loc_line[0].split(\" \")[-2:]\n",
|
||||||
|
" if loc_table:\n",
|
||||||
|
" loc_table += page.extract_table()[1:]\n",
|
||||||
|
" else:\n",
|
||||||
|
" loc_table = page.extract_table()\n",
|
||||||
|
"\n",
|
||||||
|
" if \"HONORAIRES\" in page.extract_text():\n",
|
||||||
|
" df_charge = extract_charge(page.extract_table(charge_table_settings))\n",
|
||||||
|
"\n",
|
||||||
|
" df_charge.to_excel(xls_charge, sheet_name=\"Charges\", index=False)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"df_loc = extract_situation_loc(loc_table)\n",
|
||||||
|
"df_loc = df_loc.assign(\n",
|
||||||
|
" mois = mois,\n",
|
||||||
|
" annee = annee\n",
|
||||||
|
")\n",
|
||||||
|
"df_loc.to_excel(xls_locataire, sheet_name=\"Location\", index=False)\n",
|
||||||
|
"#df_loc"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "f2e22a94",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "dad54ca3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
0
pdf_auralia/__init__.py
Normal file
0
pdf_auralia/__init__.py
Normal file
116
pdf_auralia/extract.py
Normal file
116
pdf_auralia/extract.py
Normal file
@ -0,0 +1,116 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import pdfplumber
|
||||||
|
from pathlib import Path
|
||||||
|
import click
|
||||||
|
|
||||||
|
|
||||||
|
def extract_situation_loc(table):
|
||||||
|
try:
|
||||||
|
df = pd.DataFrame(table[1:], columns=table[0])
|
||||||
|
except IndexError:
|
||||||
|
print(table)
|
||||||
|
rows = []
|
||||||
|
for i, row in df[df["Locataires"] == "Totaux"].iterrows():
|
||||||
|
above_row_loc = df.iloc[i - 1]["Locataires"]
|
||||||
|
up_row = pd.concat(
|
||||||
|
[
|
||||||
|
row,
|
||||||
|
parse_above_loc(above_row_loc),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
rows.append(up_row)
|
||||||
|
df_cleaned = pd.concat(rows, axis=1).T
|
||||||
|
df_cleaned.drop(["Locataires", "", "Période"], axis=1, inplace=True)
|
||||||
|
return df_cleaned
|
||||||
|
|
||||||
|
|
||||||
|
def parse_above_loc(content):
|
||||||
|
row = {}
|
||||||
|
try:
|
||||||
|
app, loc = content.split("\n")
|
||||||
|
except ValueError:
|
||||||
|
row["lot"] = ""
|
||||||
|
row["type"] = ""
|
||||||
|
row["locataire"] = content
|
||||||
|
|
||||||
|
else:
|
||||||
|
app_ = app.split(" ")
|
||||||
|
row["lot"] = app_[1]
|
||||||
|
row["type"] = " ".join(app_[2:])
|
||||||
|
row["locataire"] = loc
|
||||||
|
return pd.Series(row)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_charge(table):
|
||||||
|
df = (
|
||||||
|
pd.DataFrame(table[1:], columns=table[0])
|
||||||
|
.replace("", np.nan)
|
||||||
|
.dropna(subset=["Débits"])
|
||||||
|
)
|
||||||
|
drop_index = df[
|
||||||
|
df["RECAPITULATIF DES OPERATIONS"].str.contains("TOTAUX", case=False) or
|
||||||
|
df["RECAPITULATIF DES OPERATIONS"].str.contains("solde", case=False)
|
||||||
|
].index
|
||||||
|
df.drop(drop_index, inplace=True)
|
||||||
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
charge_table_settings = {
|
||||||
|
"vertical_strategy": "lines",
|
||||||
|
"horizontal_strategy": "text",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def extract_from_pdf(pdf, charge_dest, location_dest):
|
||||||
|
loc_table = []
|
||||||
|
for page in pdf.pages:
|
||||||
|
situation_loc_line = [
|
||||||
|
l
|
||||||
|
for l in page.extract_text().split("\n")
|
||||||
|
if "SITUATION DES LOCATAIRES" in l
|
||||||
|
]
|
||||||
|
if situation_loc_line:
|
||||||
|
mois, annee = situation_loc_line[0].split(" ")[-2:]
|
||||||
|
if loc_table:
|
||||||
|
loc_table += page.extract_table()[1:]
|
||||||
|
else:
|
||||||
|
loc_table = page.extract_table()
|
||||||
|
|
||||||
|
if "HONORAIRES" in page.extract_text():
|
||||||
|
df_charge = extract_charge(page.extract_table(charge_table_settings))
|
||||||
|
df_charge.to_excel(charge_dest, sheet_name="Charges", index=False)
|
||||||
|
print(f"{charge_dest} saved")
|
||||||
|
|
||||||
|
df_loc = extract_situation_loc(loc_table)
|
||||||
|
df_loc = df_loc.assign(mois=mois, annee=annee)
|
||||||
|
df_loc.to_excel(location_dest, sheet_name="Location", index=False)
|
||||||
|
print(f"{location_dest} saved")
|
||||||
|
|
||||||
|
|
||||||
|
def extract_save(pdf_file):
|
||||||
|
pdf_file = Path(pdf_file)
|
||||||
|
xls_charge = f"{pdf_file.stem.replace(' ', '_')}_charge.xlsx"
|
||||||
|
xls_locataire = f"{pdf_file.stem.replace(' ', '_')}_locataire.xlsx"
|
||||||
|
|
||||||
|
pdf = pdfplumber.open(pdf_file)
|
||||||
|
extract_from_pdf(pdf, xls_charge, xls_locataire)
|
||||||
|
|
||||||
|
@click.command()
|
||||||
|
@click.option("--pdf_file", help="Nom du fichier pdf", default="")
|
||||||
|
@click.option("--folder", help="Tous les fichiers dans folder", default="./")
|
||||||
|
def pdf2xlsx(pdf_file, folder):
|
||||||
|
if pdf_file:
|
||||||
|
extract_save(pdf_file)
|
||||||
|
else:
|
||||||
|
p = Path(folder)
|
||||||
|
pdf_files = [x for x in p.iterdir() if ".pdf" in str(x)]
|
||||||
|
for pdf_file in pdf_files:
|
||||||
|
extract_save(pdf_file)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
pdf2xlsx()
|
8
poetry.lock
generated
Normal file
8
poetry.lock
generated
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
package = []
|
||||||
|
|
||||||
|
[metadata]
|
||||||
|
lock-version = "1.1"
|
||||||
|
python-versions = "^3.10"
|
||||||
|
content-hash = "53f2eabc9c26446fbcc00d348c47878e118afc2054778c3c803a0a8028af27d9"
|
||||||
|
|
||||||
|
[metadata.files]
|
15
pyproject.toml
Normal file
15
pyproject.toml
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
[tool.poetry]
|
||||||
|
name = "pdf-auralia"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = ""
|
||||||
|
authors = ["Bertrand Benjamin <benjamin.bertrand@opytex.org>"]
|
||||||
|
readme = "README.md"
|
||||||
|
packages = [{include = "pdf_auralia"}]
|
||||||
|
|
||||||
|
[tool.poetry.dependencies]
|
||||||
|
python = "^3.10"
|
||||||
|
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["poetry-core"]
|
||||||
|
build-backend = "poetry.core.masonry.api"
|
76
requirements.txt
Normal file
76
requirements.txt
Normal file
@ -0,0 +1,76 @@
|
|||||||
|
argon2-cffi==21.3.0
|
||||||
|
argon2-cffi-bindings==21.2.0
|
||||||
|
asttokens==2.0.8
|
||||||
|
attrs==22.1.0
|
||||||
|
backcall==0.2.0
|
||||||
|
beautifulsoup4==4.11.1
|
||||||
|
bleach==5.0.1
|
||||||
|
cffi==1.15.1
|
||||||
|
charset-normalizer==2.1.1
|
||||||
|
cryptography==38.0.1
|
||||||
|
debugpy==1.6.3
|
||||||
|
decorator==5.1.1
|
||||||
|
defusedxml==0.7.1
|
||||||
|
entrypoints==0.4
|
||||||
|
et-xmlfile==1.1.0
|
||||||
|
executing==1.1.0
|
||||||
|
fastjsonschema==2.16.2
|
||||||
|
ipykernel==6.16.0
|
||||||
|
ipython==8.5.0
|
||||||
|
ipython-genutils==0.2.0
|
||||||
|
ipywidgets==8.0.2
|
||||||
|
jedi==0.18.1
|
||||||
|
Jinja2==3.1.2
|
||||||
|
jsonschema==4.16.0
|
||||||
|
jupyter==1.0.0
|
||||||
|
jupyter-console==6.4.4
|
||||||
|
jupyter-core==4.11.1
|
||||||
|
jupyter_client==7.3.5
|
||||||
|
jupyterlab-pygments==0.2.2
|
||||||
|
jupyterlab-widgets==3.0.3
|
||||||
|
lxml==4.9.1
|
||||||
|
MarkupSafe==2.1.1
|
||||||
|
matplotlib-inline==0.1.6
|
||||||
|
mistune==2.0.4
|
||||||
|
nbclient==0.6.8
|
||||||
|
nbconvert==7.0.0
|
||||||
|
nbformat==5.6.1
|
||||||
|
nest-asyncio==1.5.5
|
||||||
|
notebook==6.4.12
|
||||||
|
numpy==1.23.3
|
||||||
|
openpyxl==3.0.10
|
||||||
|
packaging==21.3
|
||||||
|
pandas==1.5.0
|
||||||
|
pandocfilters==1.5.0
|
||||||
|
parso==0.8.3
|
||||||
|
pdfminer.six==20220524
|
||||||
|
pdfplumber==0.7.4
|
||||||
|
pexpect==4.8.0
|
||||||
|
pickleshare==0.7.5
|
||||||
|
Pillow==9.2.0
|
||||||
|
prometheus-client==0.14.1
|
||||||
|
prompt-toolkit==3.0.31
|
||||||
|
psutil==5.9.2
|
||||||
|
ptyprocess==0.7.0
|
||||||
|
pure-eval==0.2.2
|
||||||
|
pycparser==2.21
|
||||||
|
Pygments==2.13.0
|
||||||
|
pyparsing==3.0.9
|
||||||
|
pyrsistent==0.18.1
|
||||||
|
python-dateutil==2.8.2
|
||||||
|
pytz==2022.2.1
|
||||||
|
pyzmq==24.0.1
|
||||||
|
qtconsole==5.3.2
|
||||||
|
QtPy==2.2.0
|
||||||
|
Send2Trash==1.8.0
|
||||||
|
six==1.16.0
|
||||||
|
soupsieve==2.3.2.post1
|
||||||
|
stack-data==0.5.1
|
||||||
|
terminado==0.15.0
|
||||||
|
tinycss2==1.1.1
|
||||||
|
tornado==6.2
|
||||||
|
traitlets==5.4.0
|
||||||
|
Wand==0.6.10
|
||||||
|
wcwidth==0.2.5
|
||||||
|
webencodings==0.5.1
|
||||||
|
widgetsnbextension==4.0.3
|
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
Loading…
Reference in New Issue
Block a user