{ "cells": [ { "cell_type": "markdown", "id": "96263cc4-e4f1-4f42-94cb-14b2d2d35302", "metadata": {}, "source": [ "# Automatic tagging" ] }, { "cell_type": "code", "execution_count": 1, "id": "a6d0b19f-9d89-4260-8662-a0f5683d0ec2", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from pathlib import Path\n", "from sklearn.pipeline import Pipeline\n", "from matplotlib import pyplot as plt " ] }, { "cell_type": "markdown", "id": "1585b7a5-d0b9-4781-accd-ee36dddd7bae", "metadata": {}, "source": [ "## Import des données" ] }, { "cell_type": "code", "execution_count": 2, "id": "0751d414-f28e-4e9a-9151-3a1dc1b05f3c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[PosixPath('../PLESNA Compta SYSTEM/staging/CRG/2020.csv'),\n", " PosixPath('../PLESNA Compta SYSTEM/staging/CRG/2018.csv'),\n", " PosixPath('../PLESNA Compta SYSTEM/staging/CRG/2022.csv'),\n", " PosixPath('../PLESNA Compta SYSTEM/staging/CRG/2021.csv'),\n", " PosixPath('../PLESNA Compta SYSTEM/staging/CRG/2023.csv'),\n", " PosixPath('../PLESNA Compta SYSTEM/staging/CRG/2019.csv'),\n", " PosixPath('../PLESNA Compta SYSTEM/staging/CRG/2017.csv')]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "staging_path = Path(\"../PLESNA Compta SYSTEM/staging/CRG/\")\n", "assert staging_path.exists()\n", "files = list(staging_path.glob(\"*.csv\"))\n", "files" ] }, { "cell_type": "code", "execution_count": 3, "id": "f88989ca-968b-4c97-849b-ef12ab24f0ee", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RégieImmeublePorteLotAnnéeMoisCatégorieFournisseurLibelléDébitCréditImpact
0Imi GéranceB9B0920201Loyer ChargeNaNRègl. Loyer 01/20200.0100.48100.48
1Imi GéranceS5S0520201Loyer ChargeNaNRègl. Prov. Char 01/20200.0191.00191.00
2Imi GéranceS5S0520201Loyer ChargeNaNRègl. Loyer 01/20200.0745.39745.39
3Imi GéranceS2S0220201Loyer ChargeNaNRègl. Prov. Char 01/20200.0519.00519.00
4Imi GéranceS2S0220201Loyer ChargeNaNRègl. Loyer 01 à 03/20200.03473.793473.79
\n", "
" ], "text/plain": [ " Régie Immeuble Porte Lot Année Mois Catégorie Fournisseur \\\n", "0 Imi Gérance B 9 B09 2020 1 Loyer Charge NaN \n", "1 Imi Gérance S 5 S05 2020 1 Loyer Charge NaN \n", "2 Imi Gérance S 5 S05 2020 1 Loyer Charge NaN \n", "3 Imi Gérance S 2 S02 2020 1 Loyer Charge NaN \n", "4 Imi Gérance S 2 S02 2020 1 Loyer Charge NaN \n", "\n", " Libellé Débit Crédit Impact \n", "0 Règl. Loyer 01/2020 0.0 100.48 100.48 \n", "1 Règl. Prov. Char 01/2020 0.0 191.00 191.00 \n", "2 Règl. Loyer 01/2020 0.0 745.39 745.39 \n", "3 Règl. Prov. Char 01/2020 0.0 519.00 519.00 \n", "4 Règl. Loyer 01 à 03/2020 0.0 3473.79 3473.79 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfs = []\n", "for file in files:\n", " dfs.append(pd.read_csv(file))\n", "df = pd.concat(dfs)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 4, "id": "012dcdaf-83de-44e3-b480-c5498421dc8f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RégieImmeublePorteLotAnnéeMoisCatégorieFournisseurLibelléDébitCréditImpact
0Imi GéranceB9B0920201Loyer ChargeRègl. Loyer 01/20200.0100.48100.48
1Imi GéranceS5S0520201Loyer ChargeRègl. Prov. Char 01/20200.0191.00191.00
2Imi GéranceS5S0520201Loyer ChargeRègl. Loyer 01/20200.0745.39745.39
3Imi GéranceS2S0220201Loyer ChargeRègl. Prov. Char 01/20200.0519.00519.00
4Imi GéranceS2S0220201Loyer ChargeRègl. Loyer 01 à 03/20200.03473.793473.79
\n", "
" ], "text/plain": [ " Régie Immeuble Porte Lot Année Mois Catégorie Fournisseur \\\n", "0 Imi Gérance B 9 B09 2020 1 Loyer Charge \n", "1 Imi Gérance S 5 S05 2020 1 Loyer Charge \n", "2 Imi Gérance S 5 S05 2020 1 Loyer Charge \n", "3 Imi Gérance S 2 S02 2020 1 Loyer Charge \n", "4 Imi Gérance S 2 S02 2020 1 Loyer Charge \n", "\n", " Libellé Débit Crédit Impact \n", "0 Règl. Loyer 01/2020 0.0 100.48 100.48 \n", "1 Règl. Prov. Char 01/2020 0.0 191.00 191.00 \n", "2 Règl. Loyer 01/2020 0.0 745.39 745.39 \n", "3 Règl. Prov. Char 01/2020 0.0 519.00 519.00 \n", "4 Règl. Loyer 01 à 03/2020 0.0 3473.79 3473.79 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = df[~df[\"Libellé\"].isna()]\n", "df = df.assign(\n", " Fournisseur = df[\"Fournisseur\"].fillna(\"\")\n", ")\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 13, "id": "396257d6-77bc-4fc7-9347-29698e1d2399", "metadata": {}, "outputs": [], "source": [ "X = df[\"Libellé\"]# + df[\"Fournisseur\"]\n", "y = df[\"Catégorie\"]" ] }, { "cell_type": "code", "execution_count": 14, "id": "5c63ad34-5fe9-41ab-8a34-c9a6003f77e3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "31929\n", "5857 Honoraires Bien COP33M- 15\n", "5858 Honoraires Bien COP33M- 16\n", "5859 Honoraires Bien COP33M- 17\n", "5860 Honoraires Bien COP33M- 18\n", "5861 Honoraires Bien COP33M- 19\n", "Name: Libellé, dtype: object\n" ] } ], "source": [ "print(len(X))\n", "print(X.tail())" ] }, { "cell_type": "markdown", "id": "273daee3-b0e2-4adf-8c0a-b1152e139abb", "metadata": {}, "source": [ "## Exploration de l'actuel" ] }, { "cell_type": "code", "execution_count": 7, "id": "76fa04a7-7087-4af3-a05d-e5de591f1cd2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df.Catégorie.value_counts().plot.bar()" ] }, { "cell_type": "code", "execution_count": null, "id": "1a8120d1-5c88-4e31-a4e0-0857309e0c9b", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "3db38e8c-4f5d-4823-954a-18300de9074d", "metadata": {}, "source": [ "## Découpage des datas" ] }, { "cell_type": "code", "execution_count": 7, "id": "8295411a-1f7e-43c7-ba19-a5a835a09223", "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "execution_count": 8, "id": "d7b1dbf9-e068-4a75-b714-d9bf88f7d028", "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" ] }, { "cell_type": "markdown", "id": "5e65fd83-a6ad-4f7c-b00c-9b9cda448074", "metadata": {}, "source": [ "## Tokenisation des Libellé" ] }, { "cell_type": "code", "execution_count": 9, "id": "962a08a1-2dd1-4da3-bcf1-1e5f3f741a24", "metadata": {}, "outputs": [], "source": [ "from nltk.stem import SnowballStemmer\n", "from sklearn.feature_extraction.text import CountVectorizer" ] }, { "cell_type": "code", "execution_count": 10, "id": "7f39c305-25a6-4ac1-9e1c-e0337f2783b8", "metadata": {}, "outputs": [], "source": [ "stemmer = SnowballStemmer('french')\n", "analyzer = CountVectorizer().build_analyzer()" ] }, { "cell_type": "code", "execution_count": 11, "id": "98b52881-6d37-4802-9a70-3963b6f03eae", "metadata": {}, "outputs": [], "source": [ "def stemmed_words(doc):\n", "    \"\"\"Run ``doc`` through the default CountVectorizer analyzer and lazily stem each token.\n", "\n", "    Uses the module-level ``analyzer`` and the French Snowball ``stemmer``;\n", "    returns a one-shot iterator of stems, in token order.\n", "    \"\"\"\n", "    return map(stemmer.stem, analyzer(doc))" ] }, { "cell_type": "code", "execution_count": 12, "id": "5ff329eb-b7b8-48e1-95f8-dfde26285be1", "metadata": {}, "outputs": [], "source": [ "vectorizer = CountVectorizer(analyzer=stemmed_words)" ] }, { "cell_type": "markdown", "id": "ccc1eba6-439a-4bf9-87df-a2b225079ae7", "metadata": {}, "source": [ "## Créations de modèles" ] }, { "cell_type": "code", 
"execution_count": 13, "id": "71da2f84-b75a-4adc-8533-956312c3fd94", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('vect',\n",
       "                 CountVectorizer(analyzer=<function stemmed_words at 0x7017e451cfe0>)),\n",
       "                ('clf', MultinomialNB())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Pipeline(steps=[('vect',\n", " CountVectorizer(analyzer=)),\n", " ('clf', MultinomialNB())])" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.naive_bayes import MultinomialNB\n", "\n", "mnb_pipeline = Pipeline([\n", " ('vect', vectorizer),\n", " ('clf', MultinomialNB())\n", "])\n", "mnb_pipeline.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 14, "id": "c5cc500a-5e49-4e07-9a9f-e410dd35b69e", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/svm/_classes.py:31: FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning.\n", " warnings.warn(\n" ] }, { "data": { "text/html": [ "
Pipeline(steps=[('vect',\n",
       "                 CountVectorizer(analyzer=<function stemmed_words at 0x7017e451cfe0>)),\n",
       "                ('clf', LinearSVC())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Pipeline(steps=[('vect',\n", " CountVectorizer(analyzer=)),\n", " ('clf', LinearSVC())])" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.svm import LinearSVC\n", "\n", "svc_pipeline = Pipeline([\n", " ('vect', vectorizer),\n", " ('clf', LinearSVC())\n", "])\n", "svc_pipeline.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 15, "id": "72387f32-b462-4113-8292-c5e88ffd5712", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('vect',\n",
       "                 CountVectorizer(analyzer=<function stemmed_words at 0x7017e451cfe0>)),\n",
       "                ('clf', SGDClassifier())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Pipeline(steps=[('vect',\n", " CountVectorizer(analyzer=)),\n", " ('clf', SGDClassifier())])" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.linear_model import SGDClassifier\n", "\n", "\n", "svm_pipeline = Pipeline([\n", "    ('vect', vectorizer),\n", "    ('clf', SGDClassifier())\n", "])\n", "svm_pipeline.fit(X_train, y_train)" ] }, { "cell_type": "markdown", "id": "93cde7ff-1ecd-4c2e-a2c8-3fabb914c78f", "metadata": {}, "source": [ "## Évaluation" ] }, { "cell_type": "code", "execution_count": 16, "id": "cbbc44ed-74ab-407e-8acf-e668513498aa", "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import accuracy_score" ] }, { "cell_type": "code", "execution_count": 17, "id": "f5450692-99d9-4080-b0a6-75b73cb9146e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MNB Accuracy: 94.15%\n" ] } ], "source": [ "# Accuracy of the Multinomial Naive Bayes pipeline on the held-out split\n", "y_pred = mnb_pipeline.predict(X_test)\n", "accuracy = accuracy_score(y_test, y_pred)\n", "print(\"MNB Accuracy: {:.2f}%\".format(accuracy * 100))" ] }, { "cell_type": "code", "execution_count": 18, "id": "0fd79ffb-752b-4886-994d-ed489b1950d1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "742 33BLO- DIAG LOT 4\n", "928 33BLO- PLAQUES LOC\n", "3466 4SER-lot 1 FRAIS COMM DIAG\n", "65 FORFAIT REGLAGE HORLOGE\n", "219 PC - ENTRETIEN ELECTRICITE\n", " ... \n", "51 Solde Départ - Remboursement Solde D.G. 
Du 120...\n", "669 Gestion impaye locataire ALUR Du 28/09/2\n", "2188 4SER - Mise en demeure KALAI\n", "3251 33BLO-LOT REMISE GESTION\n", "1665 1MAR - MAINTENANCE ELECTRIQUE\n", "Length: 186, dtype: object" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_test[y_test!=y_pred]" ] }, { "cell_type": "code", "execution_count": 19, "id": "17191e07-3a77-415b-947e-2475c0e42e08", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SVC Accuracy: 95.85%\n" ] } ], "source": [ "y_pred = svc_pipeline.predict(X_test)\n", "accuracy = accuracy_score(y_test, y_pred)\n", "print(\"SVC Accuracy: {:.2f}%\".format(accuracy * 100))" ] }, { "cell_type": "code", "execution_count": 20, "id": "cbcaf96f-56ce-4618-b578-d1498fe78d20", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3062 33BLO- LOT 15 PLAQUES\n", "928 33BLO- PLAQUES LOC\n", "268 1MAR-CONSOMMATION EAU\n", "440 PC - CONTRAT ASCENSEUR\n", "2085 vac hor INST COMPTEUR ELEC\n", " ... \n", "1057 Accès Extranet 2020\n", "51 Solde Départ - Remboursement Solde D.G. 
Du 120...\n", "2188 4SER - Mise en demeure KALAI\n", "3162 1MAR- SUIVI TRAVAUX DEBARRASS\n", "1665 1MAR - MAINTENANCE ELECTRIQUE\n", "Length: 132, dtype: object" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_test[y_test!=y_pred]" ] }, { "cell_type": "code", "execution_count": 21, "id": "8367c994-de9e-4977-b703-cd26ee4f9eb9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SGD Accuracy: 95.97%\n" ] } ], "source": [ "# svm_pipeline wraps an SGDClassifier: label it SGD so it is not confused with the LinearSVC score\n", "y_pred = svm_pipeline.predict(X_test)\n", "accuracy = accuracy_score(y_test, y_pred)\n", "print(\"SGD Accuracy: {:.2f}%\".format(accuracy * 100))" ] }, { "cell_type": "code", "execution_count": 22, "id": "d5c458d1-4b2b-410a-8d77-fffea9f6a46e", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "3062 33BLO- LOT 15 PLAQUES\n", "928 33BLO- PLAQUES LOC\n", "268 1MAR-CONSOMMATION EAU\n", "65 FORFAIT REGLAGE HORLOGE\n", "123 1MAR- dossier Grosjean\n", " ... \n", "1057 Accès Extranet 2020\n", "51 Solde Départ - Remboursement Solde D.G. 
Du 120...\n", "2188 4SER - Mise en demeure KALAI\n", "3162 1MAR- SUIVI TRAVAUX DEBARRASS\n", "1665 1MAR - MAINTENANCE ELECTRIQUE\n", "Length: 128, dtype: object" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(X_test[y_test!=y_pred])" ] }, { "cell_type": "code", "execution_count": 23, "id": "6e71a627-8bb6-470e-aba8-cb82c42b4fda", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3062 33BLO- LOT 15 PLAQUES\n", "928 33BLO- PLAQUES LOC\n", "268 1MAR-CONSOMMATION EAU\n", "65 FORFAIT REGLAGE HORLOGE\n", "123 1MAR- dossier Grosjean\n", "440 PC - CONTRAT ASCENSEUR\n", "3078 33BLO- VAC HOR PB CANALISA OFF\n", "1662 4SER - MAINTENANCE ELECTRIQUE\n", "85 DESINSECTISATION PUNAISES\n", "1550 4 SER - EDF ASCENSEUR\n", "1710 Extranet gestion locative 2017\n", "1061 Accès Extranet 2020\n", "2530 1MAR- RAMONAGE\n", "3800 1MAR- lot 6 plaques\n", "3766 1MAR- lot 6 SUIVI TRAVAUX\n", "1998 4SER- PLAQUES LOT 7\n", "1865 Frais suivi d'impaye Du 01052020 Au 3105\n", "4246 33blo- LOGE SUIVI TRAVAUX\n", "3160 Suivi travaux debarrassage\n", "144 S3 - Reception travaux\n", "1285 1 MAR - Eau gd Lyon\n", "1708 Extranet gestion locative 2017\n", "1716 4SER - Contrat ascenseur\n", "2855 33BLO- TT COMM DIAG LOT 4\n", "167 4SER- SUIVI REPAR ASCENSEUR\n", "3532 4SER- NETTOYAGE VITRAGES\n", "1107 1MAR- lOT 13 GROSJEAN HUISSIER\n", "518 4SER-TEL ASCENSEUR 1TRIM2019\n", "1345 33BLO- LOT 18 COMM DIAGNOSTICS\n", "2555 Accès Extranet 2018\n", "598 20 - PLAQUES BAL\n", "1011 Accès Extranet 2020\n", "747 33BLO- LOT 17 RED NVEAU BAIL\n", "1730 4 SER - Tél ascenseur\n", "1183 4SER - EDF ASCENSEUR\n", "1632 4S-CONTRAT ASCENSEUR-3TRIM\n", "2007 1MAR- PLAQUES LOT 9\n", "1704 Extranet gestion locative 2017\n", "1056 Accès Extranet 2020\n", "1644 4SER - Contrat ascenseur\n", "2839 4SER- LOT 9PLAQUES\n", "516 4SER - Travaux tél ascenseur\n", "2488 1MAR- ENTRETIEN ASCENSEUR\n", "749 4 SER - TELEPHONE ASCENSEUR\n", "1991 33blo- lot 17 
PLAQUES\n", "298 7 - REMISE AUX NORMES ELECTRCITE\n", "800 Avis de valeur\n", "79 1MAR - Huissier doss. Grosjean\n", "38 1MAR - LARMIERS CAVES\n", "4473 33MB-Lot11 -Sommation huissier\n", "4455 4SER - Mise en demeure KALAI\n", "211 Honoraires suivi recouvrement GROSJEAN S\n", "1236 33MB- Plaque signalétique\n", "508 1MAR- suivi trx ascen 1h offer\n", "364 1MAR-Travaux fuite ascenseur\n", "2016 Commde diagnostic Lot 19-33MBL\n", "343 Etat des risques 33M - LOT 6\n", "750 4SER - MAINTENANCE ELECTRIQUE\n", "3636 Rembt Annul frais impayé\n", "364 PC CONTRAT REGLAGE HORLOGE\n", "3174 33BLO- SUIVI TRAVAUX\n", "4594 33MB - Plaques lot 4\n", "92 RAMONAGE 1ER SEMESTRE 2017\n", "1700 Extranet gestion locative 2017\n", "2520 Accès Extranet 2018\n", "2850 1MAR- LOT 8 VAC HOR TRAVAUX\n", "2289 1MAR - LARMIERS CAVES\n", "1053 Accès Extranet 2020\n", "121 1MAR-Contrat ascenseur 1T 2020\n", "2554 Accès Extranet 2018\n", "3632 4SER-LOT 9 PLAQUES\n", "2533 Accès Extranet 2018\n", "1406 4SER- lot 8 COURR AVOCAT NUISA\n", "4366 4SER- LOT 12 PLAQUES\n", "4357 vac hor install compteur elec\n", "4450 4 SER - CT Ascenseur 1T2020\n", "3163 33BLO- SUIVI TRAVAUX DEBARRASS\n", "2318 Accès Extranet 2019\n", "1992 ESTIMATION VALEUR VENALE\n", "2775 1MAR - MAINTENANCE ELECTRICITE\n", "292 DESOURISATION PARTIES PRIVATIVES\n", "2556 Accès Extranet 2018\n", "1706 Extranet gestion locative 2017\n", "1475 33M-Lot11- Affaire PICARD\n", "4082 4SER - Sommation Versini\n", "1647 1MAR - Contrat ascenseur\n", "177 1 MAR - Eau Gd LYON\n", "685 PC - CONTRAT ASCESENEUR\n", "2540 Accès Extranet 2018\n", "2179 4SER - Réparation ascenseur\n", "2296 Accès Extranet 2019\n", "1408 33BLO- lot 16 FRAIS COMM DIAG\n", "4247 33blo- LOGE SUIVI TRAVAUX\n", "1975 33blo- LOGE SUIVI TRX offert\n", "510 1MAR- LOT 2 SUIVI TRX\n", "1714 1MAR-Lot 13-Frais huissier\n", "4300 TT COMMANDE DIAGNOSTICS\n", "917 4SER- LOT 8 FRAIS AVOCAT\n", "228 4 SER - TELEPHONE ASCENSEUR\n", "78 1MAR - Huissier doss. 
Grosjean\n", "2006 33MB- PLAQUES LOT 6\n", "1854 PC - 3ème trimestre 2020\n", "1002 33MB-Lot 17 - Plaques BAL\n", "2221 Forfait nego loyers suite COV1 Loc ASSOCIES A2...\n", "1167 33BLO- LOT 12 REDACTION BAIL\n", "1043 Accès Extranet 2020\n", "1073 1 MAR - Entretien ascenseur\n", "3169 Rbst soc ADICTUM-4SER\n", "385 Honoraires suivi de procedure GROSJEAN S\n", "1169 1MAR - Lot 6 -Frais diagnostic\n", "1937 Remboursement Solde D.G. Du 06082020\n", "143 Distribution cle/badge aux loc suite nouvelles...\n", "753 1MAR - MAINTENANCE ELECTRIQUE\n", "2532 Accès Extranet 2018\n", "534 Commde diagnostic Lot 7-4SERV\n", "3 4 SER - CT ASCENSEUR 1T2018\n", "1387 1MAR- LOT 10 PLAQUES\n", "2205 1MAR - Plaques lot 13\n", "1703 Extranet gestion locative 2017\n", "3644 1MAR- LOT 6 SUIVI TRAVAUX\n", "360 vacation horaire travaux\n", "98 PC - TELEPHONIE ASCENSEUR\n", "842 3 - RACORDEMENT ELECTRIQUE SRUDIO RDC\n", "1057 Accès Extranet 2020\n", "51 Solde Départ - Remboursement Solde D.G. Du 120...\n", "2188 4SER - Mise en demeure KALAI\n", "3162 1MAR- SUIVI TRAVAUX DEBARRASS\n", "1665 1MAR - MAINTENANCE ELECTRIQUE\n", "dtype: object\n" ] } ], "source": [ "with pd.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also\n", " print(X_test[y_test!=y_pred])" ] }, { "cell_type": "markdown", "id": "61a935da-40fd-4043-9109-ec97635dfc00", "metadata": {}, "source": [ "## Optimisations\n" ] }, { "cell_type": "code", "execution_count": 24, "id": "5321423f-55d4-4241-815c-22a516460bf6", "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import GridSearchCV" ] }, { "cell_type": "markdown", "id": "fac29ae8-a68b-434f-82d4-865856222222", "metadata": {}, "source": [ "### Modèle Naive Bayes" ] }, { "cell_type": "code", "execution_count": 25, "id": "366e66d6-5bcf-4000-85d5-6d488220070f", "metadata": {}, "outputs": [], "source": [ "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.feature_extraction.text import 
TfidfTransformer\n", "\n", "\n", "mnb_pipeline = Pipeline([\n", " ('vect', CountVectorizer()),\n", " #('tfid', TfidfTransformer()),\n", " ('clf', MultinomialNB())\n", "])" ] }, { "cell_type": "code", "execution_count": 26, "id": "008f18a2-5538-412e-8863-c287aef8af0d", "metadata": {}, "outputs": [], "source": [ "parameters = {\n", " 'vect__ngram_range': [(1, 1), (1, 2), (2,2)],\n", " #'tfidf__use_idf': (True, False),\n", " 'clf__alpha': (1, 1e-1,1e-2, 1e-3,),\n", "}" ] }, { "cell_type": "code", "execution_count": 27, "id": "70fe6252-a653-4cfe-89d4-809518e7968b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/model_selection/_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.\n", " warnings.warn(\n" ] } ], "source": [ "gs_clf = GridSearchCV(mnb_pipeline, parameters, n_jobs=-1)\n", "gs_clf = gs_clf.fit(X, y)" ] }, { "cell_type": "code", "execution_count": 28, "id": "f5a9281a-a37d-4373-8710-9ea089d39ddb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9446366782006921" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gs_clf.best_score_" ] }, { "cell_type": "code", "execution_count": 29, "id": "2d84af17-194e-4718-9b96-94cb1b5330e1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'clf__alpha': 0.001, 'vect__ngram_range': (1, 2)}" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gs_clf.best_params_" ] }, { "cell_type": "markdown", "id": "ac0800c5-58f3-4ee7-876c-9d3e4e9eed57", "metadata": {}, "source": [ "### Linear SVC" ] }, { "cell_type": "code", "execution_count": 34, "id": "f4e02729-9559-4579-b4a7-f875a3becb72", "metadata": {}, "outputs": [], "source": [ "from sklearn.feature_extraction.text import TfidfTransformer\n", "from sklearn.svm import LinearSVC\n", "\n", "svc_pipeline = 
Pipeline([\n", "    ('vect', CountVectorizer()),\n", "    #('tfid', TfidfTransformer()),\n", "    ('clf', LinearSVC())\n", "])" ] },
{ "cell_type": "code", "execution_count": null, "id": "578baa50-8d47-4a1a-a5f3-f2f992245693", "metadata": {}, "outputs": [], "source": [ "# LinearSVC is regularised through `C`; it has no `alpha` parameter, so a\n", "# `clf__alpha` grid would be rejected as an invalid parameter by the search.\n", "parameters = {\n", "    'vect__ngram_range': [(1, 1), (1, 2), (2,2)],\n", "    #'tfid__use_idf': (True, False),\n", "    'clf__C': (1, 1e-1,1e-2, 1e-3,),\n", "}" ] },
{ "cell_type": "markdown", "id": "e894aebd-b0d8-4ce9-900b-0a6afb7e10bd", "metadata": {}, "source": [ "### SGD" ] },
{ "cell_type": "code", "execution_count": null, "id": "6a793449-8540-43e2-9683-f81cfe47488b", "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import SGDClassifier\n", "\n", "\n", "def stemmed_tokens(doc):\n", "    \"\"\"Return the stems of ``doc`` as a list, for use as a CountVectorizer ``tokenizer``.\n", "\n", "    Supplying the stemmer as ``tokenizer`` rather than as a callable ``analyzer``\n", "    leaves n-gram construction to CountVectorizer, so the ``vect__ngram_range``\n", "    grid below is actually applied instead of being ignored with a warning.\n", "    A list (not a generator) is returned so the tokens can be measured and\n", "    re-sliced when n-grams are built.\n", "    \"\"\"\n", "    return list(stemmed_words(doc))\n", "\n", "\n", "sgd_pipeline = Pipeline([\n", "    # token_pattern=None: the regex is unused once a tokenizer is supplied\n", "    ('vect', CountVectorizer(tokenizer=stemmed_tokens, token_pattern=None)),\n", "    ('clf', SGDClassifier())\n", "])" ] },
{ "cell_type": "code", "execution_count": 36, "id": "a871784a-05f6-471b-a972-d154db3ed181", "metadata": {}, "outputs": [], "source": [ "parameters = {\n", "    'vect__ngram_range': [(1, 1), (1, 2), (2,2)],\n", "    #'tfidf__use_idf': (True, False),\n", "    'clf__tol': (1, 1e-1,1e-2, 1e-3,),\n", "}" ] },
{ "cell_type": "code", "execution_count": null, "id": "9efff414-1052-4e42-b60d-a6955ccacfa4", "metadata": {}, "outputs": [], "source": [ "gs_clf = GridSearchCV(sgd_pipeline, parameters, n_jobs=-1)\n", "gs_clf = gs_clf.fit(X, y)" ] },
{ "cell_type": "code", "execution_count": null, "id": "eb47b8c4-5eca-4e54-93e2-7a932261aedb", "metadata": {}, "outputs": [], "source": [ "gs_clf.best_score_" ] },
{ "cell_type": "code", "execution_count": null, "id": "e9920e95-7720-48cd-83f8-a650a12d9639", "metadata": {}, "outputs": [], "source": [ "gs_clf.best_params_" ] },
{ "cell_type": "code", "execution_count": null, "id": "2d5b30f5-3114-4559-8a77-e30d716134a3", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" } }, "nbformat": 4, "nbformat_minor": 5 }