plesna/notebooks/auto_tagging.ipynb

2555 lines
149 KiB
Plaintext
Raw Normal View History

2024-02-17 04:28:42 +00:00
{
"cells": [
{
"cell_type": "markdown",
"id": "96263cc4-e4f1-4f42-94cb-14b2d2d35302",
"metadata": {},
"source": [
"# Automatic tagging"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "a6d0b19f-9d89-4260-8662-a0f5683d0ec2",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from pathlib import Path\n",
"from sklearn.pipeline import Pipeline\n",
"from matplotlib import pyplot as plt "
]
},
{
"cell_type": "markdown",
"id": "1585b7a5-d0b9-4781-accd-ee36dddd7bae",
"metadata": {},
"source": [
"## Import des données"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "0751d414-f28e-4e9a-9151-3a1dc1b05f3c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[PosixPath('../PLESNA Compta SYSTEM/staging/CRG/2020.csv'),\n",
" PosixPath('../PLESNA Compta SYSTEM/staging/CRG/2018.csv'),\n",
" PosixPath('../PLESNA Compta SYSTEM/staging/CRG/2022.csv'),\n",
" PosixPath('../PLESNA Compta SYSTEM/staging/CRG/2021.csv'),\n",
" PosixPath('../PLESNA Compta SYSTEM/staging/CRG/2023.csv'),\n",
" PosixPath('../PLESNA Compta SYSTEM/staging/CRG/2019.csv'),\n",
" PosixPath('../PLESNA Compta SYSTEM/staging/CRG/2017.csv')]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"staging_path = Path(\"../PLESNA Compta SYSTEM/staging/CRG/\")\n",
"assert staging_path.exists()\n",
"files = list(staging_path.glob(\"*.csv\"))\n",
"files"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "f88989ca-968b-4c97-849b-ef12ab24f0ee",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Régie</th>\n",
" <th>Immeuble</th>\n",
" <th>Porte</th>\n",
" <th>Lot</th>\n",
" <th>Année</th>\n",
" <th>Mois</th>\n",
" <th>Catégorie</th>\n",
" <th>Fournisseur</th>\n",
" <th>Libellé</th>\n",
" <th>Débit</th>\n",
" <th>Crédit</th>\n",
" <th>Impact</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Imi Gérance</td>\n",
" <td>B</td>\n",
" <td>9</td>\n",
" <td>B09</td>\n",
" <td>2020</td>\n",
" <td>1</td>\n",
" <td>Loyer Charge</td>\n",
" <td>NaN</td>\n",
" <td>Règl. Loyer 01/2020</td>\n",
" <td>0.0</td>\n",
" <td>100.48</td>\n",
" <td>100.48</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Imi Gérance</td>\n",
" <td>S</td>\n",
" <td>5</td>\n",
" <td>S05</td>\n",
" <td>2020</td>\n",
" <td>1</td>\n",
" <td>Loyer Charge</td>\n",
" <td>NaN</td>\n",
" <td>Règl. Prov. Char 01/2020</td>\n",
" <td>0.0</td>\n",
" <td>191.00</td>\n",
" <td>191.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Imi Gérance</td>\n",
" <td>S</td>\n",
" <td>5</td>\n",
" <td>S05</td>\n",
" <td>2020</td>\n",
" <td>1</td>\n",
" <td>Loyer Charge</td>\n",
" <td>NaN</td>\n",
" <td>Règl. Loyer 01/2020</td>\n",
" <td>0.0</td>\n",
" <td>745.39</td>\n",
" <td>745.39</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Imi Gérance</td>\n",
" <td>S</td>\n",
" <td>2</td>\n",
" <td>S02</td>\n",
" <td>2020</td>\n",
" <td>1</td>\n",
" <td>Loyer Charge</td>\n",
" <td>NaN</td>\n",
" <td>Règl. Prov. Char 01/2020</td>\n",
" <td>0.0</td>\n",
" <td>519.00</td>\n",
" <td>519.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Imi Gérance</td>\n",
" <td>S</td>\n",
" <td>2</td>\n",
" <td>S02</td>\n",
" <td>2020</td>\n",
" <td>1</td>\n",
" <td>Loyer Charge</td>\n",
" <td>NaN</td>\n",
" <td>Règl. Loyer 01 à 03/2020</td>\n",
" <td>0.0</td>\n",
" <td>3473.79</td>\n",
" <td>3473.79</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Régie Immeuble Porte Lot Année Mois Catégorie Fournisseur \\\n",
"0 Imi Gérance B 9 B09 2020 1 Loyer Charge NaN \n",
"1 Imi Gérance S 5 S05 2020 1 Loyer Charge NaN \n",
"2 Imi Gérance S 5 S05 2020 1 Loyer Charge NaN \n",
"3 Imi Gérance S 2 S02 2020 1 Loyer Charge NaN \n",
"4 Imi Gérance S 2 S02 2020 1 Loyer Charge NaN \n",
"\n",
" Libellé Débit Crédit Impact \n",
"0 Règl. Loyer 01/2020 0.0 100.48 100.48 \n",
"1 Règl. Prov. Char 01/2020 0.0 191.00 191.00 \n",
"2 Règl. Loyer 01/2020 0.0 745.39 745.39 \n",
"3 Règl. Prov. Char 01/2020 0.0 519.00 519.00 \n",
"4 Règl. Loyer 01 à 03/2020 0.0 3473.79 3473.79 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfs = []\n",
"for file in files:\n",
" dfs.append(pd.read_csv(file))\n",
"df = pd.concat(dfs)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "012dcdaf-83de-44e3-b480-c5498421dc8f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Régie</th>\n",
" <th>Immeuble</th>\n",
" <th>Porte</th>\n",
" <th>Lot</th>\n",
" <th>Année</th>\n",
" <th>Mois</th>\n",
" <th>Catégorie</th>\n",
" <th>Fournisseur</th>\n",
" <th>Libellé</th>\n",
" <th>Débit</th>\n",
" <th>Crédit</th>\n",
" <th>Impact</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Imi Gérance</td>\n",
" <td>B</td>\n",
" <td>9</td>\n",
" <td>B09</td>\n",
" <td>2020</td>\n",
" <td>1</td>\n",
" <td>Loyer Charge</td>\n",
" <td></td>\n",
" <td>Règl. Loyer 01/2020</td>\n",
" <td>0.0</td>\n",
" <td>100.48</td>\n",
" <td>100.48</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Imi Gérance</td>\n",
" <td>S</td>\n",
" <td>5</td>\n",
" <td>S05</td>\n",
" <td>2020</td>\n",
" <td>1</td>\n",
" <td>Loyer Charge</td>\n",
" <td></td>\n",
" <td>Règl. Prov. Char 01/2020</td>\n",
" <td>0.0</td>\n",
" <td>191.00</td>\n",
" <td>191.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Imi Gérance</td>\n",
" <td>S</td>\n",
" <td>5</td>\n",
" <td>S05</td>\n",
" <td>2020</td>\n",
" <td>1</td>\n",
" <td>Loyer Charge</td>\n",
" <td></td>\n",
" <td>Règl. Loyer 01/2020</td>\n",
" <td>0.0</td>\n",
" <td>745.39</td>\n",
" <td>745.39</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Imi Gérance</td>\n",
" <td>S</td>\n",
" <td>2</td>\n",
" <td>S02</td>\n",
" <td>2020</td>\n",
" <td>1</td>\n",
" <td>Loyer Charge</td>\n",
" <td></td>\n",
" <td>Règl. Prov. Char 01/2020</td>\n",
" <td>0.0</td>\n",
" <td>519.00</td>\n",
" <td>519.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Imi Gérance</td>\n",
" <td>S</td>\n",
" <td>2</td>\n",
" <td>S02</td>\n",
" <td>2020</td>\n",
" <td>1</td>\n",
" <td>Loyer Charge</td>\n",
" <td></td>\n",
" <td>Règl. Loyer 01 à 03/2020</td>\n",
" <td>0.0</td>\n",
" <td>3473.79</td>\n",
" <td>3473.79</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Régie Immeuble Porte Lot Année Mois Catégorie Fournisseur \\\n",
"0 Imi Gérance B 9 B09 2020 1 Loyer Charge \n",
"1 Imi Gérance S 5 S05 2020 1 Loyer Charge \n",
"2 Imi Gérance S 5 S05 2020 1 Loyer Charge \n",
"3 Imi Gérance S 2 S02 2020 1 Loyer Charge \n",
"4 Imi Gérance S 2 S02 2020 1 Loyer Charge \n",
"\n",
" Libellé Débit Crédit Impact \n",
"0 Règl. Loyer 01/2020 0.0 100.48 100.48 \n",
"1 Règl. Prov. Char 01/2020 0.0 191.00 191.00 \n",
"2 Règl. Loyer 01/2020 0.0 745.39 745.39 \n",
"3 Règl. Prov. Char 01/2020 0.0 519.00 519.00 \n",
"4 Règl. Loyer 01 à 03/2020 0.0 3473.79 3473.79 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = df[~df[\"Libellé\"].isna()]\n",
"df = df.assign(\n",
" Fournisseur = df[\"Fournisseur\"].fillna(\"\")\n",
")\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "396257d6-77bc-4fc7-9347-29698e1d2399",
"metadata": {},
"outputs": [],
"source": [
"X = df[\"Libellé\"]# + df[\"Fournisseur\"]\n",
"y = df[\"Catégorie\"]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "5c63ad34-5fe9-41ab-8a34-c9a6003f77e3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"31929\n",
"5857 Honoraires Bien COP33M- 15\n",
"5858 Honoraires Bien COP33M- 16\n",
"5859 Honoraires Bien COP33M- 17\n",
"5860 Honoraires Bien COP33M- 18\n",
"5861 Honoraires Bien COP33M- 19\n",
"Name: Libellé, dtype: object\n"
]
}
],
"source": [
"print(len(X))\n",
"print(X.tail())"
]
},
{
"cell_type": "markdown",
"id": "273daee3-b0e2-4adf-8c0a-b1152e139abb",
"metadata": {},
"source": [
"## Exploration de l'actuel"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "76fa04a7-7087-4af3-a05d-e5de591f1cd2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Axes: xlabel='Catégorie'>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjkAAAJzCAYAAADtKAJnAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAACHJklEQVR4nO3dd1QU1/8+8GcB6dWGoEgRGwrWqGjsRiyxxU9s2FFjL9gTJXaNxt6IsSCJsSQxxqhREQsWbCgoKgo21IAmIiIWFLi/P/wxX1awJTM7YfK8zplz3Jlh3ndx2X125s69OiGEABEREZHGGKndACIiIiIlMOQQERGRJjHkEBERkSYx5BAREZEmMeQQERGRJjHkEBERkSYx5BAREZEmMeQQERGRJpmo3QA1ZWdn448//oCNjQ10Op3azSEiIqJ3IITAo0eP4OzsDCOj15+v+U+HnD/++AMuLi5qN4OIiIj+hlu3bqFUqVKv3f6fDjk2NjYAXv6SbG1tVW4NERERvYu0tDS4uLhIn+Ov858OOTmXqGxtbRlyiIiICpi3dTVhx2MiIiLSJIYcIiIi0iSGHCIiItIkhhwiIiLSJIYcIiIi0iSGHCIiItIkhhwiIiLSJIYcIiIi0iSGHCIiItIkhhwiIiLSJIYcIiIi0iSGHCIiItIkhhwiIiLSJIYcIiIi0iQTtRvwb+Y2Yeff+rkbc1rL3BIiIiJ6XzyTQ0RERJrEkENERESaxJBDREREmsSQQ0RERJrEkENERESaxJBDREREmsSQQ0RERJrEkENERESaxJBDREREmsSQQ0RERJrEkENERESaxJBDREREmsSQQ0RERJrEkENERESaxJBDREREmsSQQ0RERJrEkENERESaxJBDREREmsSQQ0RERJrEkENERESaxJBDREREmsSQQ0RERJr03iEnIiICbdq0gbOzM3Q6HbZt26a3XafT5bvMmzdP2sfNzS3P9jlz5ugd59y5c6hfvz7Mzc3h4uKCuXPn5mnLjz/+iAoVKsDc3Bze3t7YtWvX+z4dIiIi0qj3DjmPHz9GlSpVsHz58ny3JyUl6S1r166FTqdDx44d9fabNm2a3n7Dhg2TtqWlpaF58+ZwdXVFVFQU5s2bhylTpmDVqlXSPseOHUPXrl0REBCAs2fPon379mjfvj1iY2Pf9ykRERGRBpm87w+0bNkSLVu2fO32EiVK6D3+9ddf0bhxY3h4eOitt7GxybNvjg0bNuD58+dYu3YtTE1NUalSJURHR2PBggUYMGAAAGDx4sVo0aIFxo4dCwCYPn06wsLCsGzZMgQHB7/v0yIiIiKNUbRPzt27d7Fz504EBATk2TZnzhwUKVIE1apVw7x585CZmSlti4yMRIMGDWBqaiqt8/Pzw+XLl/HgwQNpn2bNmukd08/PD5GRka9tT0ZGBtLS0vQWIiIi0qb3PpPzPtavXw8bGxt88skneuuHDx+O6tWro3Dhwjh27BgmTpyIpKQkLFiwAACQnJwMd3d3vZ9xdHSUtjk4OCA5OVlal3uf5OTk17Zn9uzZmDp1qhxPjYiIiP7lFA05a9euhb+/P8zNzfXWBwYGSv/28fGBqakpPvvsM8yePRtmZmaKtWfixIl6tdPS0uDi4qJYPSIiIlKPYiHn8OHDuHz5MjZv3vzWfWvXro3MzEzcuHED5cuXR4kSJXD37l29fXIe5/Tjed0+r+vnAwBmZmaKhigiIiL691CsT86aNWtQo0YNVKlS5a37RkdHw8jICMWLFwcA+Pr6IiIiAi9evJD2CQsLQ/ny5eHg4CDtEx4ernecsLAw+Pr6yvgsiIiIqKB675CTnp6O6OhoREdHAwCuX7+O6OhoJCYmSvukpaXhxx9/RL9+/fL8fGRkJBYtWoSYmBhcu3YNGzZswKhRo9C9e3cpwHTr1g2mpqYICAjAhQsXsHnzZixevFjvUtOIESOwe/duzJ8/H3FxcZgyZQpOnz6NoUOHvu9TIiIiIg1678tVp0+fRuPGjaXHOcGjV69eCAkJAQBs2rQJQgh07do1z8+bmZlh06ZNmDJlCjIyMuDu7o5Ro0bpBRg7Ozvs3bsXQ4YMQY0aNVC0aFEEBQVJt48DQN26dfHDDz9g0qRJ+Pzzz1G2bFls27YNlStXft+nRERERBqkE0IItRuhlrS0NNjZ2eHhw4ewtbXNs91tws6/ddwbc1r/06YRERHRa7zt8zsH564iIiIiTWLIISIiIk1iyCEiIiJNYsghIiIiTWLIISIiIk1iyCEiIiJNYsghIiIiTWLIISIiIk1iyCEiIiJNYsghIiIiTWLIISIiIk1iyCEiIiJNYsghIiIiTWLIISIiIk1iyCEiIiJNYsghIiIiTWLIISIiIk1iyCEiIiJNYsghIiIiTWLIISIiIk1iyCEiIiJNYsghIiIiTWLIISIiIk1iyCEiIiJNYsghIiIiTWLIISIiIk1iyCEiIiJNYsghIiIiTWLIISIiIk1iyCEiIiJNYsghIiIiTWLIISIiIk1iyCEiIiJNYsghIiIiTWLIISIiIk1675ATERGBNm3awNnZGTqdDtu2bdPb3rt3b+h0Or2lRYsWevukpKTA398ftra2sLe3R0BAANLT0/X2OXfuHOrXrw9zc3O4uLhg7ty5edry448/okKFCjA3N4e3tzd27dr1vk+HiIiINOq9Q87jx49RpUoVLF++/LX7tGjRAklJSdKyceNGve3+/v64cOECwsLCsGPHDkRERGDAgAHS9rS0NDRv3hyurq6IiorCvHnzMGXKFKxatUra59ixY+jatSsCAgJw9uxZtG/fHu3bt0dsbOz7PiUiIiLSIJ0QQvztH9bp8Msvv6B9+/bSut69eyM1NTXPGZ4cly5dgpeXF06dOoWaNWsCAHbv3o1WrVrh9u3bcHZ2xsqVK/HFF18gOTkZpqamAIAJEyZg27ZtiIuLAwB07twZjx8/xo4dO6Rj16lTB1WrVkVwcPA7tT8tLQ12dnZ4+PAhbG1t82x3m7DznY7zqhtzWv+tnyMiIqK3e9vndw5F+uQcPHgQxYsXR/ny5TFo0CDcv39f2hYZGQl7e3sp4ABAs2bNYGRkhBMnTkj7NGjQQAo4AODn54fLly/jwYMH0j7NmjXTq+vn54fIyMjXtisjIwNpaWl6CxEREWmT7CGnRYsWCA0NRXh4OL766iscOnQILVu2RFZWFgAgOTkZxYsX1/sZExMTFC5cGMnJydI+jo6OevvkPH7bPjnb8zN79mzY2dlJi4uLyz97skRERPSvZSL3Abt06SL929vbGz4+PihTpgwOHjyIpk2byl3uvUycOBGBgYHS47S0NAYdIiIijVL8FnIPDw8ULVoUCQkJAIASJUrg3r17evtkZmYiJSUFJUqUkPa5e/eu3j45j9+2T872/JiZmcHW1lZvISIiIm1SPOTcvn0b9+/fh5OTEwDA19cXqampiIqKkvbZv38/srOzUbt2bWmfiIgIvHjxQtonLCwM5cuXh4ODg7RPeHi4Xq2wsDD4+voq/ZSIiIioAHjvkJOeno7o6GhER0cDAK5fv47o6GgkJiYiPT0dY8eOxfHjx3Hjxg2Eh4ejXbt28PT0hJ+fHwCgYsWKaNGiBfr374+TJ0/i6NGjGDp0KLp06QJnZ2cAQLdu3WBqaoqAgABcuHABmzdvxuLFi/UuNY0YMQK7d+/G/PnzERcXhylTpuD06dMYOnSoDL8WIiIiKujeO+ScPn0a1apVQ7Vq1QAAgYGBqFatGoKCgmBsbIxz586hbdu2KFeuHAICAlCjRg0cPnwYZmZm0jE2bNiAChUqoGnTpmjVqhU+/PBDvTFw7OzssHfvXly/fh01atTA6NGjERQUpDeWTt26dfHDDz9g1apVqFKlCn766Sds27YNlStX/ie/DyIiItKIfzROTkHHcXKIiIgKHlXHySEiIiJSG0MOERERaRJDDhEREWkSQw4RERFpEkMOERERaRJDDhEREWkSQw4RERFpEkMOERERaRJDDhEREWkSQw4RERFpEkMOERERaRJDDhEREWkSQw4RERFpEkMOERERaRJDDhEREWkSQw4RERF
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df.Catégorie.value_counts().plot.bar()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1a8120d1-5c88-4e31-a4e0-0857309e0c9b",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "3db38e8c-4f5d-4823-954a-18300de9074d",
"metadata": {},
"source": [
"## Découpage des datas"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "8295411a-1f7e-43c7-ba19-a5a835a09223",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d7b1dbf9-e068-4a75-b714-d9bf88f7d028",
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
]
},
{
"cell_type": "markdown",
"id": "5e65fd83-a6ad-4f7c-b00c-9b9cda448074",
"metadata": {},
"source": [
"## Tokenisation des Libellé"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "962a08a1-2dd1-4da3-bcf1-1e5f3f741a24",
"metadata": {},
"outputs": [],
"source": [
"from nltk.stem import SnowballStemmer\n",
"from sklearn.feature_extraction.text import CountVectorizer"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "7f39c305-25a6-4ac1-9e1c-e0337f2783b8",
"metadata": {},
"outputs": [],
"source": [
"stemmer = SnowballStemmer('french')\n",
"analyzer = CountVectorizer().build_analyzer()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "98b52881-6d37-4802-9a70-3963b6f03eae",
"metadata": {},
"outputs": [],
"source": [
"def stemmed_words(doc):\n",
" return (stemmer.stem(w) for w in analyzer(doc))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "5ff329eb-b7b8-48e1-95f8-dfde26285be1",
"metadata": {},
"outputs": [],
"source": [
"vectorizer = CountVectorizer(analyzer=stemmed_words)"
]
},
{
"cell_type": "markdown",
"id": "ccc1eba6-439a-4bf9-87df-a2b225079ae7",
"metadata": {},
"source": [
"## Créations de modèles"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "71da2f84-b75a-4adc-8533-956312c3fd94",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-1 {\n",
" /* Definition of color scheme common for light and dark mode */\n",
" --sklearn-color-text: black;\n",
" --sklearn-color-line: gray;\n",
" /* Definition of color scheme for unfitted estimators */\n",
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
" --sklearn-color-unfitted-level-3: chocolate;\n",
" /* Definition of color scheme for fitted estimators */\n",
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
" --sklearn-color-fitted-level-1: #d4ebff;\n",
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
"\n",
" /* Specific color for light theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-icon: #696969;\n",
"\n",
" @media (prefers-color-scheme: dark) {\n",
" /* Redefinition of color scheme for dark theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-icon: #878787;\n",
" }\n",
"}\n",
"\n",
"#sk-container-id-1 {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"#sk-container-id-1 pre {\n",
" padding: 0;\n",
"}\n",
"\n",
"#sk-container-id-1 input.sk-hidden--visually {\n",
" border: 0;\n",
" clip: rect(1px 1px 1px 1px);\n",
" clip: rect(1px, 1px, 1px, 1px);\n",
" height: 1px;\n",
" margin: -1px;\n",
" overflow: hidden;\n",
" padding: 0;\n",
" position: absolute;\n",
" width: 1px;\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-dashed-wrapped {\n",
" border: 1px dashed var(--sklearn-color-line);\n",
" margin: 0 0.4em 0.5em 0.4em;\n",
" box-sizing: border-box;\n",
" padding-bottom: 0.4em;\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-container {\n",
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
" so we also need the `!important` here to be able to override the\n",
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
" display: inline-block !important;\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-text-repr-fallback {\n",
" display: none;\n",
"}\n",
"\n",
"div.sk-parallel-item,\n",
"div.sk-serial,\n",
"div.sk-item {\n",
" /* draw centered vertical line to link estimators */\n",
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
" background-size: 2px 100%;\n",
" background-repeat: no-repeat;\n",
" background-position: center center;\n",
"}\n",
"\n",
"/* Parallel-specific style estimator block */\n",
"\n",
"#sk-container-id-1 div.sk-parallel-item::after {\n",
" content: \"\";\n",
" width: 100%;\n",
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
" flex-grow: 1;\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-parallel {\n",
" display: flex;\n",
" align-items: stretch;\n",
" justify-content: center;\n",
" background-color: var(--sklearn-color-background);\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-parallel-item {\n",
" display: flex;\n",
" flex-direction: column;\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-parallel-item:first-child::after {\n",
" align-self: flex-end;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-parallel-item:last-child::after {\n",
" align-self: flex-start;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-parallel-item:only-child::after {\n",
" width: 0;\n",
"}\n",
"\n",
"/* Serial-specific style estimator block */\n",
"\n",
"#sk-container-id-1 div.sk-serial {\n",
" display: flex;\n",
" flex-direction: column;\n",
" align-items: center;\n",
" background-color: var(--sklearn-color-background);\n",
" padding-right: 1em;\n",
" padding-left: 1em;\n",
"}\n",
"\n",
"\n",
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
"clickable and can be expanded/collapsed.\n",
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
"*/\n",
"\n",
"/* Pipeline and ColumnTransformer style (default) */\n",
"\n",
"#sk-container-id-1 div.sk-toggleable {\n",
" /* Default theme specific background. It is overwritten whether we have a\n",
" specific estimator or a Pipeline/ColumnTransformer */\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"/* Toggleable label */\n",
"#sk-container-id-1 label.sk-toggleable__label {\n",
" cursor: pointer;\n",
" display: block;\n",
" width: 100%;\n",
" margin-bottom: 0;\n",
" padding: 0.5em;\n",
" box-sizing: border-box;\n",
" text-align: center;\n",
"}\n",
"\n",
"#sk-container-id-1 label.sk-toggleable__label-arrow:before {\n",
" /* Arrow on the left of the label */\n",
" content: \"▸\";\n",
" float: left;\n",
" margin-right: 0.25em;\n",
" color: var(--sklearn-color-icon);\n",
"}\n",
"\n",
"#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"/* Toggleable content - dropdown */\n",
"\n",
"#sk-container-id-1 div.sk-toggleable__content {\n",
" max-height: 0;\n",
" max-width: 0;\n",
" overflow: hidden;\n",
" text-align: left;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-toggleable__content.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-toggleable__content pre {\n",
" margin: 0.2em;\n",
" border-radius: 0.25em;\n",
" color: var(--sklearn-color-text);\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-toggleable__content.fitted pre {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
" /* Expand drop-down */\n",
" max-height: 200px;\n",
" max-width: 100%;\n",
" overflow: auto;\n",
"}\n",
"\n",
"#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
" content: \"▾\";\n",
"}\n",
"\n",
"/* Pipeline/ColumnTransformer-specific style */\n",
"\n",
"#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator-specific style */\n",
"\n",
"/* Colorize estimator box */\n",
"#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-label label.sk-toggleable__label,\n",
"#sk-container-id-1 div.sk-label label {\n",
" /* The background is the default theme color */\n",
" color: var(--sklearn-color-text-on-default-background);\n",
"}\n",
"\n",
"/* On hover, darken the color of the background */\n",
"#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"/* Label box, darken color on hover, fitted */\n",
"#sk-container-id-1 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator label */\n",
"\n",
"#sk-container-id-1 div.sk-label label {\n",
" font-family: monospace;\n",
" font-weight: bold;\n",
" display: inline-block;\n",
" line-height: 1.2em;\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-label-container {\n",
" text-align: center;\n",
"}\n",
"\n",
"/* Estimator-specific */\n",
"#sk-container-id-1 div.sk-estimator {\n",
" font-family: monospace;\n",
" border: 1px dotted var(--sklearn-color-border-box);\n",
" border-radius: 0.25em;\n",
" box-sizing: border-box;\n",
" margin-bottom: 0.5em;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-estimator.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"/* on hover */\n",
"#sk-container-id-1 div.sk-estimator:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-estimator.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
"\n",
"/* Common style for \"i\" and \"?\" */\n",
"\n",
".sk-estimator-doc-link,\n",
"a:link.sk-estimator-doc-link,\n",
"a:visited.sk-estimator-doc-link {\n",
" float: right;\n",
" font-size: smaller;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1em;\n",
" height: 1em;\n",
" width: 1em;\n",
" text-decoration: none !important;\n",
" margin-left: 1ex;\n",
" /* unfitted */\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted,\n",
"a:link.sk-estimator-doc-link.fitted,\n",
"a:visited.sk-estimator-doc-link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"/* Span, style for the box shown on hovering the info icon */\n",
".sk-estimator-doc-link span {\n",
" display: none;\n",
" z-index: 9999;\n",
" position: relative;\n",
" font-weight: normal;\n",
" right: .2ex;\n",
" padding: .5ex;\n",
" margin: .5ex;\n",
" width: min-content;\n",
" min-width: 20ex;\n",
" max-width: 50ex;\n",
" color: var(--sklearn-color-text);\n",
" box-shadow: 2pt 2pt 4pt #999;\n",
" /* unfitted */\n",
" background: var(--sklearn-color-unfitted-level-0);\n",
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted span {\n",
" /* fitted */\n",
" background: var(--sklearn-color-fitted-level-0);\n",
" border: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link:hover span {\n",
" display: block;\n",
"}\n",
"\n",
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
"\n",
"#sk-container-id-1 a.estimator_doc_link {\n",
" float: right;\n",
" font-size: 1rem;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1rem;\n",
" height: 1rem;\n",
" width: 1rem;\n",
" text-decoration: none;\n",
" /* unfitted */\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
"}\n",
"\n",
"#sk-container-id-1 a.estimator_doc_link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"#sk-container-id-1 a.estimator_doc_link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"#sk-container-id-1 a.estimator_doc_link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>Pipeline(steps=[(&#x27;vect&#x27;,\n",
" CountVectorizer(analyzer=&lt;function stemmed_words at 0x7017e451cfe0&gt;)),\n",
" (&#x27;clf&#x27;, MultinomialNB())])</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" ><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;Pipeline<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.pipeline.Pipeline.html\">?<span>Documentation for Pipeline</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>Pipeline(steps=[(&#x27;vect&#x27;,\n",
" CountVectorizer(analyzer=&lt;function stemmed_words at 0x7017e451cfe0&gt;)),\n",
" (&#x27;clf&#x27;, MultinomialNB())])</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-2\" type=\"checkbox\" ><label for=\"sk-estimator-id-2\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;CountVectorizer<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html\">?<span>Documentation for CountVectorizer</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>CountVectorizer(analyzer=&lt;function stemmed_words at 0x7017e451cfe0&gt;)</pre></div> </div></div><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-3\" type=\"checkbox\" ><label for=\"sk-estimator-id-3\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;MultinomialNB<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.naive_bayes.MultinomialNB.html\">?<span>Documentation for MultinomialNB</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>MultinomialNB()</pre></div> </div></div></div></div></div></div>"
],
"text/plain": [
"Pipeline(steps=[('vect',\n",
" CountVectorizer(analyzer=<function stemmed_words at 0x7017e451cfe0>)),\n",
" ('clf', MultinomialNB())])"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.naive_bayes import MultinomialNB\n",
"\n",
"mnb_pipeline = Pipeline([\n",
" ('vect', vectorizer),\n",
" ('clf', MultinomialNB())\n",
"])\n",
"mnb_pipeline.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "c5cc500a-5e49-4e07-9a9f-e410dd35b69e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/svm/_classes.py:31: FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning.\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/html": [
"<style>#sk-container-id-2 {\n",
" /* Definition of color scheme common for light and dark mode */\n",
" --sklearn-color-text: black;\n",
" --sklearn-color-line: gray;\n",
" /* Definition of color scheme for unfitted estimators */\n",
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
" --sklearn-color-unfitted-level-3: chocolate;\n",
" /* Definition of color scheme for fitted estimators */\n",
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
" --sklearn-color-fitted-level-1: #d4ebff;\n",
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
"\n",
" /* Specific color for light theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-icon: #696969;\n",
"\n",
" @media (prefers-color-scheme: dark) {\n",
" /* Redefinition of color scheme for dark theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-icon: #878787;\n",
" }\n",
"}\n",
"\n",
"#sk-container-id-2 {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"#sk-container-id-2 pre {\n",
" padding: 0;\n",
"}\n",
"\n",
"#sk-container-id-2 input.sk-hidden--visually {\n",
" border: 0;\n",
" clip: rect(1px 1px 1px 1px);\n",
" clip: rect(1px, 1px, 1px, 1px);\n",
" height: 1px;\n",
" margin: -1px;\n",
" overflow: hidden;\n",
" padding: 0;\n",
" position: absolute;\n",
" width: 1px;\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-dashed-wrapped {\n",
" border: 1px dashed var(--sklearn-color-line);\n",
" margin: 0 0.4em 0.5em 0.4em;\n",
" box-sizing: border-box;\n",
" padding-bottom: 0.4em;\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-container {\n",
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
" so we also need the `!important` here to be able to override the\n",
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
" display: inline-block !important;\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-text-repr-fallback {\n",
" display: none;\n",
"}\n",
"\n",
"div.sk-parallel-item,\n",
"div.sk-serial,\n",
"div.sk-item {\n",
" /* draw centered vertical line to link estimators */\n",
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
" background-size: 2px 100%;\n",
" background-repeat: no-repeat;\n",
" background-position: center center;\n",
"}\n",
"\n",
"/* Parallel-specific style estimator block */\n",
"\n",
"#sk-container-id-2 div.sk-parallel-item::after {\n",
" content: \"\";\n",
" width: 100%;\n",
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
" flex-grow: 1;\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-parallel {\n",
" display: flex;\n",
" align-items: stretch;\n",
" justify-content: center;\n",
" background-color: var(--sklearn-color-background);\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-parallel-item {\n",
" display: flex;\n",
" flex-direction: column;\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-parallel-item:first-child::after {\n",
" align-self: flex-end;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-parallel-item:last-child::after {\n",
" align-self: flex-start;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-parallel-item:only-child::after {\n",
" width: 0;\n",
"}\n",
"\n",
"/* Serial-specific style estimator block */\n",
"\n",
"#sk-container-id-2 div.sk-serial {\n",
" display: flex;\n",
" flex-direction: column;\n",
" align-items: center;\n",
" background-color: var(--sklearn-color-background);\n",
" padding-right: 1em;\n",
" padding-left: 1em;\n",
"}\n",
"\n",
"\n",
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
"clickable and can be expanded/collapsed.\n",
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
"*/\n",
"\n",
"/* Pipeline and ColumnTransformer style (default) */\n",
"\n",
"#sk-container-id-2 div.sk-toggleable {\n",
" /* Default theme specific background. It is overwritten whether we have a\n",
" specific estimator or a Pipeline/ColumnTransformer */\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"/* Toggleable label */\n",
"#sk-container-id-2 label.sk-toggleable__label {\n",
" cursor: pointer;\n",
" display: block;\n",
" width: 100%;\n",
" margin-bottom: 0;\n",
" padding: 0.5em;\n",
" box-sizing: border-box;\n",
" text-align: center;\n",
"}\n",
"\n",
"#sk-container-id-2 label.sk-toggleable__label-arrow:before {\n",
" /* Arrow on the left of the label */\n",
" content: \"▸\";\n",
" float: left;\n",
" margin-right: 0.25em;\n",
" color: var(--sklearn-color-icon);\n",
"}\n",
"\n",
"#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"/* Toggleable content - dropdown */\n",
"\n",
"#sk-container-id-2 div.sk-toggleable__content {\n",
" max-height: 0;\n",
" max-width: 0;\n",
" overflow: hidden;\n",
" text-align: left;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-toggleable__content.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-toggleable__content pre {\n",
" margin: 0.2em;\n",
" border-radius: 0.25em;\n",
" color: var(--sklearn-color-text);\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-toggleable__content.fitted pre {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
" /* Expand drop-down */\n",
" max-height: 200px;\n",
" max-width: 100%;\n",
" overflow: auto;\n",
"}\n",
"\n",
"#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
" content: \"▾\";\n",
"}\n",
"\n",
"/* Pipeline/ColumnTransformer-specific style */\n",
"\n",
"#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator-specific style */\n",
"\n",
"/* Colorize estimator box */\n",
"#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-label label.sk-toggleable__label,\n",
"#sk-container-id-2 div.sk-label label {\n",
" /* The background is the default theme color */\n",
" color: var(--sklearn-color-text-on-default-background);\n",
"}\n",
"\n",
"/* On hover, darken the color of the background */\n",
"#sk-container-id-2 div.sk-label:hover label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"/* Label box, darken color on hover, fitted */\n",
"#sk-container-id-2 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator label */\n",
"\n",
"#sk-container-id-2 div.sk-label label {\n",
" font-family: monospace;\n",
" font-weight: bold;\n",
" display: inline-block;\n",
" line-height: 1.2em;\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-label-container {\n",
" text-align: center;\n",
"}\n",
"\n",
"/* Estimator-specific */\n",
"#sk-container-id-2 div.sk-estimator {\n",
" font-family: monospace;\n",
" border: 1px dotted var(--sklearn-color-border-box);\n",
" border-radius: 0.25em;\n",
" box-sizing: border-box;\n",
" margin-bottom: 0.5em;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-estimator.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"/* on hover */\n",
"#sk-container-id-2 div.sk-estimator:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-estimator.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
"\n",
"/* Common style for \"i\" and \"?\" */\n",
"\n",
".sk-estimator-doc-link,\n",
"a:link.sk-estimator-doc-link,\n",
"a:visited.sk-estimator-doc-link {\n",
" float: right;\n",
" font-size: smaller;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1em;\n",
" height: 1em;\n",
" width: 1em;\n",
" text-decoration: none !important;\n",
" margin-left: 1ex;\n",
" /* unfitted */\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted,\n",
"a:link.sk-estimator-doc-link.fitted,\n",
"a:visited.sk-estimator-doc-link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"/* Span, style for the box shown on hovering the info icon */\n",
".sk-estimator-doc-link span {\n",
" display: none;\n",
" z-index: 9999;\n",
" position: relative;\n",
" font-weight: normal;\n",
" right: .2ex;\n",
" padding: .5ex;\n",
" margin: .5ex;\n",
" width: min-content;\n",
" min-width: 20ex;\n",
" max-width: 50ex;\n",
" color: var(--sklearn-color-text);\n",
" box-shadow: 2pt 2pt 4pt #999;\n",
" /* unfitted */\n",
" background: var(--sklearn-color-unfitted-level-0);\n",
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted span {\n",
" /* fitted */\n",
" background: var(--sklearn-color-fitted-level-0);\n",
" border: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link:hover span {\n",
" display: block;\n",
"}\n",
"\n",
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
"\n",
"#sk-container-id-2 a.estimator_doc_link {\n",
" float: right;\n",
" font-size: 1rem;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1rem;\n",
" height: 1rem;\n",
" width: 1rem;\n",
" text-decoration: none;\n",
" /* unfitted */\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
"}\n",
"\n",
"#sk-container-id-2 a.estimator_doc_link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"#sk-container-id-2 a.estimator_doc_link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"#sk-container-id-2 a.estimator_doc_link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"</style><div id=\"sk-container-id-2\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>Pipeline(steps=[(&#x27;vect&#x27;,\n",
" CountVectorizer(analyzer=&lt;function stemmed_words at 0x7017e451cfe0&gt;)),\n",
" (&#x27;clf&#x27;, LinearSVC())])</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-4\" type=\"checkbox\" ><label for=\"sk-estimator-id-4\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;Pipeline<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.pipeline.Pipeline.html\">?<span>Documentation for Pipeline</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>Pipeline(steps=[(&#x27;vect&#x27;,\n",
" CountVectorizer(analyzer=&lt;function stemmed_words at 0x7017e451cfe0&gt;)),\n",
" (&#x27;clf&#x27;, LinearSVC())])</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-5\" type=\"checkbox\" ><label for=\"sk-estimator-id-5\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;CountVectorizer<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html\">?<span>Documentation for CountVectorizer</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>CountVectorizer(analyzer=&lt;function stemmed_words at 0x7017e451cfe0&gt;)</pre></div> </div></div><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-6\" type=\"checkbox\" ><label for=\"sk-estimator-id-6\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;LinearSVC<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.svm.LinearSVC.html\">?<span>Documentation for LinearSVC</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>LinearSVC()</pre></div> </div></div></div></div></div></div>"
],
"text/plain": [
"Pipeline(steps=[('vect',\n",
" CountVectorizer(analyzer=<function stemmed_words at 0x7017e451cfe0>)),\n",
" ('clf', LinearSVC())])"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.svm import LinearSVC\n",
"\n",
"svc_pipeline = Pipeline([\n",
" ('vect', vectorizer),\n",
" ('clf', LinearSVC())\n",
"])\n",
"svc_pipeline.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "72387f32-b462-4113-8292-c5e88ffd5712",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-3 {\n",
" /* Definition of color scheme common for light and dark mode */\n",
" --sklearn-color-text: black;\n",
" --sklearn-color-line: gray;\n",
" /* Definition of color scheme for unfitted estimators */\n",
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
" --sklearn-color-unfitted-level-3: chocolate;\n",
" /* Definition of color scheme for fitted estimators */\n",
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
" --sklearn-color-fitted-level-1: #d4ebff;\n",
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
"\n",
" /* Specific color for light theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-icon: #696969;\n",
"\n",
" @media (prefers-color-scheme: dark) {\n",
" /* Redefinition of color scheme for dark theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-icon: #878787;\n",
" }\n",
"}\n",
"\n",
"#sk-container-id-3 {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"#sk-container-id-3 pre {\n",
" padding: 0;\n",
"}\n",
"\n",
"#sk-container-id-3 input.sk-hidden--visually {\n",
" border: 0;\n",
" clip: rect(1px 1px 1px 1px);\n",
" clip: rect(1px, 1px, 1px, 1px);\n",
" height: 1px;\n",
" margin: -1px;\n",
" overflow: hidden;\n",
" padding: 0;\n",
" position: absolute;\n",
" width: 1px;\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-dashed-wrapped {\n",
" border: 1px dashed var(--sklearn-color-line);\n",
" margin: 0 0.4em 0.5em 0.4em;\n",
" box-sizing: border-box;\n",
" padding-bottom: 0.4em;\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-container {\n",
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
" so we also need the `!important` here to be able to override the\n",
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
" display: inline-block !important;\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-text-repr-fallback {\n",
" display: none;\n",
"}\n",
"\n",
"div.sk-parallel-item,\n",
"div.sk-serial,\n",
"div.sk-item {\n",
" /* draw centered vertical line to link estimators */\n",
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
" background-size: 2px 100%;\n",
" background-repeat: no-repeat;\n",
" background-position: center center;\n",
"}\n",
"\n",
"/* Parallel-specific style estimator block */\n",
"\n",
"#sk-container-id-3 div.sk-parallel-item::after {\n",
" content: \"\";\n",
" width: 100%;\n",
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
" flex-grow: 1;\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-parallel {\n",
" display: flex;\n",
" align-items: stretch;\n",
" justify-content: center;\n",
" background-color: var(--sklearn-color-background);\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-parallel-item {\n",
" display: flex;\n",
" flex-direction: column;\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-parallel-item:first-child::after {\n",
" align-self: flex-end;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-parallel-item:last-child::after {\n",
" align-self: flex-start;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-parallel-item:only-child::after {\n",
" width: 0;\n",
"}\n",
"\n",
"/* Serial-specific style estimator block */\n",
"\n",
"#sk-container-id-3 div.sk-serial {\n",
" display: flex;\n",
" flex-direction: column;\n",
" align-items: center;\n",
" background-color: var(--sklearn-color-background);\n",
" padding-right: 1em;\n",
" padding-left: 1em;\n",
"}\n",
"\n",
"\n",
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
"clickable and can be expanded/collapsed.\n",
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
"*/\n",
"\n",
"/* Pipeline and ColumnTransformer style (default) */\n",
"\n",
"#sk-container-id-3 div.sk-toggleable {\n",
" /* Default theme specific background. It is overwritten whether we have a\n",
" specific estimator or a Pipeline/ColumnTransformer */\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"/* Toggleable label */\n",
"#sk-container-id-3 label.sk-toggleable__label {\n",
" cursor: pointer;\n",
" display: block;\n",
" width: 100%;\n",
" margin-bottom: 0;\n",
" padding: 0.5em;\n",
" box-sizing: border-box;\n",
" text-align: center;\n",
"}\n",
"\n",
"#sk-container-id-3 label.sk-toggleable__label-arrow:before {\n",
" /* Arrow on the left of the label */\n",
" content: \"▸\";\n",
" float: left;\n",
" margin-right: 0.25em;\n",
" color: var(--sklearn-color-icon);\n",
"}\n",
"\n",
"#sk-container-id-3 label.sk-toggleable__label-arrow:hover:before {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"/* Toggleable content - dropdown */\n",
"\n",
"#sk-container-id-3 div.sk-toggleable__content {\n",
" max-height: 0;\n",
" max-width: 0;\n",
" overflow: hidden;\n",
" text-align: left;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-toggleable__content.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-toggleable__content pre {\n",
" margin: 0.2em;\n",
" border-radius: 0.25em;\n",
" color: var(--sklearn-color-text);\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-toggleable__content.fitted pre {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-3 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
" /* Expand drop-down */\n",
" max-height: 200px;\n",
" max-width: 100%;\n",
" overflow: auto;\n",
"}\n",
"\n",
"#sk-container-id-3 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
" content: \"▾\";\n",
"}\n",
"\n",
"/* Pipeline/ColumnTransformer-specific style */\n",
"\n",
"#sk-container-id-3 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator-specific style */\n",
"\n",
"/* Colorize estimator box */\n",
"#sk-container-id-3 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-label label.sk-toggleable__label,\n",
"#sk-container-id-3 div.sk-label label {\n",
" /* The background is the default theme color */\n",
" color: var(--sklearn-color-text-on-default-background);\n",
"}\n",
"\n",
"/* On hover, darken the color of the background */\n",
"#sk-container-id-3 div.sk-label:hover label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"/* Label box, darken color on hover, fitted */\n",
"#sk-container-id-3 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator label */\n",
"\n",
"#sk-container-id-3 div.sk-label label {\n",
" font-family: monospace;\n",
" font-weight: bold;\n",
" display: inline-block;\n",
" line-height: 1.2em;\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-label-container {\n",
" text-align: center;\n",
"}\n",
"\n",
"/* Estimator-specific */\n",
"#sk-container-id-3 div.sk-estimator {\n",
" font-family: monospace;\n",
" border: 1px dotted var(--sklearn-color-border-box);\n",
" border-radius: 0.25em;\n",
" box-sizing: border-box;\n",
" margin-bottom: 0.5em;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-estimator.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"/* on hover */\n",
"#sk-container-id-3 div.sk-estimator:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-estimator.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
"\n",
"/* Common style for \"i\" and \"?\" */\n",
"\n",
".sk-estimator-doc-link,\n",
"a:link.sk-estimator-doc-link,\n",
"a:visited.sk-estimator-doc-link {\n",
" float: right;\n",
" font-size: smaller;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1em;\n",
" height: 1em;\n",
" width: 1em;\n",
" text-decoration: none !important;\n",
" margin-left: 1ex;\n",
" /* unfitted */\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted,\n",
"a:link.sk-estimator-doc-link.fitted,\n",
"a:visited.sk-estimator-doc-link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"/* Span, style for the box shown on hovering the info icon */\n",
".sk-estimator-doc-link span {\n",
" display: none;\n",
" z-index: 9999;\n",
" position: relative;\n",
" font-weight: normal;\n",
" right: .2ex;\n",
" padding: .5ex;\n",
" margin: .5ex;\n",
" width: min-content;\n",
" min-width: 20ex;\n",
" max-width: 50ex;\n",
" color: var(--sklearn-color-text);\n",
" box-shadow: 2pt 2pt 4pt #999;\n",
" /* unfitted */\n",
" background: var(--sklearn-color-unfitted-level-0);\n",
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted span {\n",
" /* fitted */\n",
" background: var(--sklearn-color-fitted-level-0);\n",
" border: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link:hover span {\n",
" display: block;\n",
"}\n",
"\n",
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
"\n",
"#sk-container-id-3 a.estimator_doc_link {\n",
" float: right;\n",
" font-size: 1rem;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1rem;\n",
" height: 1rem;\n",
" width: 1rem;\n",
" text-decoration: none;\n",
" /* unfitted */\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
"}\n",
"\n",
"#sk-container-id-3 a.estimator_doc_link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"#sk-container-id-3 a.estimator_doc_link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"#sk-container-id-3 a.estimator_doc_link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"</style><div id=\"sk-container-id-3\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>Pipeline(steps=[(&#x27;vect&#x27;,\n",
" CountVectorizer(analyzer=&lt;function stemmed_words at 0x7017e451cfe0&gt;)),\n",
" (&#x27;clf&#x27;, SGDClassifier())])</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-7\" type=\"checkbox\" ><label for=\"sk-estimator-id-7\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;Pipeline<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.pipeline.Pipeline.html\">?<span>Documentation for Pipeline</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>Pipeline(steps=[(&#x27;vect&#x27;,\n",
" CountVectorizer(analyzer=&lt;function stemmed_words at 0x7017e451cfe0&gt;)),\n",
" (&#x27;clf&#x27;, SGDClassifier())])</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-8\" type=\"checkbox\" ><label for=\"sk-estimator-id-8\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;CountVectorizer<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html\">?<span>Documentation for CountVectorizer</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>CountVectorizer(analyzer=&lt;function stemmed_words at 0x7017e451cfe0&gt;)</pre></div> </div></div><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-9\" type=\"checkbox\" ><label for=\"sk-estimator-id-9\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;SGDClassifier<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.linear_model.SGDClassifier.html\">?<span>Documentation for SGDClassifier</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>SGDClassifier()</pre></div> </div></div></div></div></div></div>"
],
"text/plain": [
"Pipeline(steps=[('vect',\n",
" CountVectorizer(analyzer=<function stemmed_words at 0x7017e451cfe0>)),\n",
" ('clf', SGDClassifier())])"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.linear_model import SGDClassifier\n",
"\n",
"\n",
"svm_pipeline = Pipeline([\n",
" ('vect', vectorizer),\n",
" ('clf', SGDClassifier())\n",
"])\n",
"svm_pipeline.fit(X_train, y_train)"
]
},
{
"cell_type": "markdown",
"id": "93cde7ff-1ecd-4c2e-a2c8-3fabb914c78f",
"metadata": {},
"source": [
"## Évaluation"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "cbbc44ed-74ab-407e-8acf-e668513498aa",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import accuracy_score"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "f5450692-99d9-4080-b0a6-75b73cb9146e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MNB ccuracy: 94.15%\n"
]
}
],
"source": [
"y_pred = mnb_pipeline.predict(X_test)\n",
"accuracy = accuracy_score(y_test, y_pred)\n",
"print(\"MNB ccuracy: {:.2f}%\".format(accuracy * 100))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "0fd79ffb-752b-4886-994d-ed489b1950d1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"742 33BLO- DIAG LOT 4\n",
"928 33BLO- PLAQUES LOC\n",
"3466 4SER-lot 1 FRAIS COMM DIAG\n",
"65 FORFAIT REGLAGE HORLOGE\n",
"219 PC - ENTRETIEN ELECTRICITE\n",
" ... \n",
"51 Solde Départ - Remboursement Solde D.G. Du 120...\n",
"669 Gestion impaye locataire ALUR Du 28/09/2\n",
"2188 4SER - Mise en demeure KALAI\n",
"3251 33BLO-LOT REMISE GESTION\n",
"1665 1MAR - MAINTENANCE ELECTRIQUE\n",
"Length: 186, dtype: object"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test[y_test!=y_pred]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "17191e07-3a77-415b-947e-2475c0e42e08",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SVC Accuracy: 95.85%\n"
]
}
],
"source": [
"y_pred = svc_pipeline.predict(X_test)\n",
"accuracy = accuracy_score(y_test, y_pred)\n",
"print(\"SVC Accuracy: {:.2f}%\".format(accuracy * 100))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "cbcaf96f-56ce-4618-b578-d1498fe78d20",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3062 33BLO- LOT 15 PLAQUES\n",
"928 33BLO- PLAQUES LOC\n",
"268 1MAR-CONSOMMATION EAU\n",
"440 PC - CONTRAT ASCENSEUR\n",
"2085 vac hor INST COMPTEUR ELEC\n",
" ... \n",
"1057 Accès Extranet 2020\n",
"51 Solde Départ - Remboursement Solde D.G. Du 120...\n",
"2188 4SER - Mise en demeure KALAI\n",
"3162 1MAR- SUIVI TRAVAUX DEBARRASS\n",
"1665 1MAR - MAINTENANCE ELECTRIQUE\n",
"Length: 132, dtype: object"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test[y_test!=y_pred]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "8367c994-de9e-4977-b703-cd26ee4f9eb9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SVC Accuracy: 95.97%\n"
]
}
],
"source": [
"y_pred = svm_pipeline.predict(X_test)\n",
"accuracy = accuracy_score(y_test, y_pred)\n",
"print(\"SVC Accuracy: {:.2f}%\".format(accuracy * 100))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "d5c458d1-4b2b-410a-8d77-fffea9f6a46e",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"3062 33BLO- LOT 15 PLAQUES\n",
"928 33BLO- PLAQUES LOC\n",
"268 1MAR-CONSOMMATION EAU\n",
"65 FORFAIT REGLAGE HORLOGE\n",
"123 1MAR- dossier Grosjean\n",
" ... \n",
"1057 Accès Extranet 2020\n",
"51 Solde Départ - Remboursement Solde D.G. Du 120...\n",
"2188 4SER - Mise en demeure KALAI\n",
"3162 1MAR- SUIVI TRAVAUX DEBARRASS\n",
"1665 1MAR - MAINTENANCE ELECTRIQUE\n",
"Length: 128, dtype: object"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display(X_test[y_test!=y_pred])"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "6e71a627-8bb6-470e-aba8-cb82c42b4fda",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3062 33BLO- LOT 15 PLAQUES\n",
"928 33BLO- PLAQUES LOC\n",
"268 1MAR-CONSOMMATION EAU\n",
"65 FORFAIT REGLAGE HORLOGE\n",
"123 1MAR- dossier Grosjean\n",
"440 PC - CONTRAT ASCENSEUR\n",
"3078 33BLO- VAC HOR PB CANALISA OFF\n",
"1662 4SER - MAINTENANCE ELECTRIQUE\n",
"85 DESINSECTISATION PUNAISES\n",
"1550 4 SER - EDF ASCENSEUR\n",
"1710 Extranet gestion locative 2017\n",
"1061 Accès Extranet 2020\n",
"2530 1MAR- RAMONAGE\n",
"3800 1MAR- lot 6 plaques\n",
"3766 1MAR- lot 6 SUIVI TRAVAUX\n",
"1998 4SER- PLAQUES LOT 7\n",
"1865 Frais suivi d'impaye Du 01052020 Au 3105\n",
"4246 33blo- LOGE SUIVI TRAVAUX\n",
"3160 Suivi travaux debarrassage\n",
"144 S3 - Reception travaux\n",
"1285 1 MAR - Eau gd Lyon\n",
"1708 Extranet gestion locative 2017\n",
"1716 4SER - Contrat ascenseur\n",
"2855 33BLO- TT COMM DIAG LOT 4\n",
"167 4SER- SUIVI REPAR ASCENSEUR\n",
"3532 4SER- NETTOYAGE VITRAGES\n",
"1107 1MAR- lOT 13 GROSJEAN HUISSIER\n",
"518 4SER-TEL ASCENSEUR 1TRIM2019\n",
"1345 33BLO- LOT 18 COMM DIAGNOSTICS\n",
"2555 Accès Extranet 2018\n",
"598 20 - PLAQUES BAL\n",
"1011 Accès Extranet 2020\n",
"747 33BLO- LOT 17 RED NVEAU BAIL\n",
"1730 4 SER - Tél ascenseur\n",
"1183 4SER - EDF ASCENSEUR\n",
"1632 4S-CONTRAT ASCENSEUR-3TRIM\n",
"2007 1MAR- PLAQUES LOT 9\n",
"1704 Extranet gestion locative 2017\n",
"1056 Accès Extranet 2020\n",
"1644 4SER - Contrat ascenseur\n",
"2839 4SER- LOT 9PLAQUES\n",
"516 4SER - Travaux tél ascenseur\n",
"2488 1MAR- ENTRETIEN ASCENSEUR\n",
"749 4 SER - TELEPHONE ASCENSEUR\n",
"1991 33blo- lot 17 PLAQUES\n",
"298 7 - REMISE AUX NORMES ELECTRCITE\n",
"800 Avis de valeur\n",
"79 1MAR - Huissier doss. Grosjean\n",
"38 1MAR - LARMIERS CAVES\n",
"4473 33MB-Lot11 -Sommation huissier\n",
"4455 4SER - Mise en demeure KALAI\n",
"211 Honoraires suivi recouvrement GROSJEAN S\n",
"1236 33MB- Plaque signalétique\n",
"508 1MAR- suivi trx ascen 1h offer\n",
"364 1MAR-Travaux fuite ascenseur\n",
"2016 Commde diagnostic Lot 19-33MBL\n",
"343 Etat des risques 33M - LOT 6\n",
"750 4SER - MAINTENANCE ELECTRIQUE\n",
"3636 Rembt Annul frais impayé\n",
"364 PC CONTRAT REGLAGE HORLOGE\n",
"3174 33BLO- SUIVI TRAVAUX\n",
"4594 33MB - Plaques lot 4\n",
"92 RAMONAGE 1ER SEMESTRE 2017\n",
"1700 Extranet gestion locative 2017\n",
"2520 Accès Extranet 2018\n",
"2850 1MAR- LOT 8 VAC HOR TRAVAUX\n",
"2289 1MAR - LARMIERS CAVES\n",
"1053 Accès Extranet 2020\n",
"121 1MAR-Contrat ascenseur 1T 2020\n",
"2554 Accès Extranet 2018\n",
"3632 4SER-LOT 9 PLAQUES\n",
"2533 Accès Extranet 2018\n",
"1406 4SER- lot 8 COURR AVOCAT NUISA\n",
"4366 4SER- LOT 12 PLAQUES\n",
"4357 vac hor install compteur elec\n",
"4450 4 SER - CT Ascenseur 1T2020\n",
"3163 33BLO- SUIVI TRAVAUX DEBARRASS\n",
"2318 Accès Extranet 2019\n",
"1992 ESTIMATION VALEUR VENALE\n",
"2775 1MAR - MAINTENANCE ELECTRICITE\n",
"292 DESOURISATION PARTIES PRIVATIVES\n",
"2556 Accès Extranet 2018\n",
"1706 Extranet gestion locative 2017\n",
"1475 33M-Lot11- Affaire PICARD\n",
"4082 4SER - Sommation Versini\n",
"1647 1MAR - Contrat ascenseur\n",
"177 1 MAR - Eau Gd LYON\n",
"685 PC - CONTRAT ASCESENEUR\n",
"2540 Accès Extranet 2018\n",
"2179 4SER - Réparation ascenseur\n",
"2296 Accès Extranet 2019\n",
"1408 33BLO- lot 16 FRAIS COMM DIAG\n",
"4247 33blo- LOGE SUIVI TRAVAUX\n",
"1975 33blo- LOGE SUIVI TRX offert\n",
"510 1MAR- LOT 2 SUIVI TRX\n",
"1714 1MAR-Lot 13-Frais huissier\n",
"4300 TT COMMANDE DIAGNOSTICS\n",
"917 4SER- LOT 8 FRAIS AVOCAT\n",
"228 4 SER - TELEPHONE ASCENSEUR\n",
"78 1MAR - Huissier doss. Grosjean\n",
"2006 33MB- PLAQUES LOT 6\n",
"1854 PC - 3ème trimestre 2020\n",
"1002 33MB-Lot 17 - Plaques BAL\n",
"2221 Forfait nego loyers suite COV1 Loc ASSOCIES A2...\n",
"1167 33BLO- LOT 12 REDACTION BAIL\n",
"1043 Accès Extranet 2020\n",
"1073 1 MAR - Entretien ascenseur\n",
"3169 Rbst soc ADICTUM-4SER\n",
"385 Honoraires suivi de procedure GROSJEAN S\n",
"1169 1MAR - Lot 6 -Frais diagnostic\n",
"1937 Remboursement Solde D.G. Du 06082020\n",
"143 Distribution cle/badge aux loc suite nouvelles...\n",
"753 1MAR - MAINTENANCE ELECTRIQUE\n",
"2532 Accès Extranet 2018\n",
"534 Commde diagnostic Lot 7-4SERV\n",
"3 4 SER - CT ASCENSEUR 1T2018\n",
"1387 1MAR- LOT 10 PLAQUES\n",
"2205 1MAR - Plaques lot 13\n",
"1703 Extranet gestion locative 2017\n",
"3644 1MAR- LOT 6 SUIVI TRAVAUX\n",
"360 vacation horaire travaux\n",
"98 PC - TELEPHONIE ASCENSEUR\n",
"842 3 - RACORDEMENT ELECTRIQUE SRUDIO RDC\n",
"1057 Accès Extranet 2020\n",
"51 Solde Départ - Remboursement Solde D.G. Du 120...\n",
"2188 4SER - Mise en demeure KALAI\n",
"3162 1MAR- SUIVI TRAVAUX DEBARRASS\n",
"1665 1MAR - MAINTENANCE ELECTRIQUE\n",
"dtype: object\n"
]
}
],
"source": [
"with pd.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also\n",
" print(X_test[y_test!=y_pred])"
]
},
{
"cell_type": "markdown",
"id": "61a935da-40fd-4043-9109-ec97635dfc00",
"metadata": {},
"source": [
"## Optimisations\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "5321423f-55d4-4241-815c-22a516460bf6",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import GridSearchCV"
]
},
{
"cell_type": "markdown",
"id": "fac29ae8-a68b-434f-82d4-865856222222",
"metadata": {},
"source": [
"### Modèle Naive Bayes"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "366e66d6-5bcf-4000-85d5-6d488220070f",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.feature_extraction.text import TfidfTransformer\n",
"\n",
"\n",
"mnb_pipeline = Pipeline([\n",
" ('vect', CountVectorizer()),\n",
" #('tfid', TfidfTransformer()),\n",
" ('clf', MultinomialNB())\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "008f18a2-5538-412e-8863-c287aef8af0d",
"metadata": {},
"outputs": [],
"source": [
"parameters = {\n",
" 'vect__ngram_range': [(1, 1), (1, 2), (2,2)],\n",
" #'tfidf__use_idf': (True, False),\n",
" 'clf__alpha': (1, 1e-1,1e-2, 1e-3,),\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "70fe6252-a653-4cfe-89d4-809518e7968b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/model_selection/_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.\n",
" warnings.warn(\n"
]
}
],
"source": [
"gs_clf = GridSearchCV(mnb_pipeline, parameters, n_jobs=-1)\n",
"gs_clf = gs_clf.fit(X, y)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "f5a9281a-a37d-4373-8710-9ea089d39ddb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9446366782006921"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gs_clf.best_score_"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "2d84af17-194e-4718-9b96-94cb1b5330e1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'clf__alpha': 0.001, 'vect__ngram_range': (1, 2)}"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gs_clf.best_params_"
]
},
{
"cell_type": "markdown",
"id": "ac0800c5-58f3-4ee7-876c-9d3e4e9eed57",
"metadata": {},
"source": [
"### Linear SVC"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "f4e02729-9559-4579-b4a7-f875a3becb72",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfTransformer\n",
"from sklearn.svm import LinearSVC\n",
"\n",
"svc_pipeline = Pipeline([\n",
" ('vect', CountVectorizer()),\n",
" #('tfid', TfidfTransformer()),\n",
" ('clf', LinearSVC())\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "578baa50-8d47-4a1a-a5f3-f2f992245693",
"metadata": {},
"outputs": [],
"source": [
"parameters = {\n",
" 'vect__ngram_range': [(1, 1), (1, 2), (2,2)],\n",
" #'tfidf__use_idf': (True, False),\n",
" 'clf__alpha': (1, 1e-1,1e-2, 1e-3,),\n",
"}"
]
},
{
"cell_type": "markdown",
"id": "e894aebd-b0d8-4ce9-900b-0a6afb7e10bd",
"metadata": {},
"source": [
"### SGD"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "6a793449-8540-43e2-9683-f81cfe47488b",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.linear_model import SGDClassifier\n",
"\n",
"\n",
"sgd_pipeline = Pipeline([\n",
" ('vect', vectorizer),\n",
" ('clf', SGDClassifier())\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "a871784a-05f6-471b-a972-d154db3ed181",
"metadata": {},
"outputs": [],
"source": [
"parameters = {\n",
" 'vect__ngram_range': [(1, 1), (1, 2), (2,2)],\n",
" #'tfidf__use_idf': (True, False),\n",
" 'clf__tol': (1, 1e-1,1e-2, 1e-3,),\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "9efff414-1052-4e42-b60d-a6955ccacfa4",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/model_selection/_split.py:737: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n",
"/home/commun/scripts/Plesna/.venv/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:541: UserWarning: The parameter 'ngram_range' will not be used since 'analyzer' is callable'\n",
" warnings.warn(\n"
]
}
],
"source": [
"gs_clf = GridSearchCV(sgd_pipeline, parameters, n_jobs=-1)\n",
"gs_clf = gs_clf.fit(X, y)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "eb47b8c4-5eca-4e54-93e2-7a932261aedb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9485372758729159"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gs_clf.best_score_"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "e9920e95-7720-48cd-83f8-a650a12d9639",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'clf__tol': 0.001, 'vect__ngram_range': (1, 1)}"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gs_clf.best_params_"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2d5b30f5-3114-4559-8a77-e30d716134a3",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}