{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"id": "051c5726",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from matplotlib import pyplot as plt\n",
"import numpy as np\n",
"from wordcloud import WordCloud\n",
"\n",
"import spacy\n",
"from spacy.lang.pt.stop_words import STOP_WORDS\n",
"from spacy.tokens import DocBin\n",
"\n",
"import json\n",
"import re\n",
"import string\n",
"from stop_words import get_stop_words\n",
"import os\n",
"\n",
"import math\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold\n",
"\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "310c031d",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option(\"display.max_colwidth\", 255)\n",
"pd.set_option('display.max_rows', 255)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cc6821d0",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_excel(\"lemma-mapping23-nltk.xlsx\")\n",
"cnt = df.groupby(\"CASISTICA_MOTIVAZIONE\").count() \\\n",
".rename(columns={\"Unnamed: 0\": \"count\"})[\"count\"]\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "775e1f62",
"metadata": {},
"outputs": [],
"source": [
"run_count = 0"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "b3b74b7a",
"metadata": {},
"outputs": [],
"source": [
"\"\"\"class_dict = {\"Doc. sanitaria scaduta\" : 20, \n",
" \"Esenzione per ultratrentennale\" : 20, \n",
" \"Variazioni tecniche\" : 20, \n",
" \"Furto tardivo\" : 20, \n",
" \"Rientro ecoincentivo\" : 20, \n",
" \"Rateizzazione\" : 20, \n",
" \"EREDI\" : 20, \n",
" \"Domiciliazione assente\" : 20, \n",
" \"Esenzione successiva\" : 20, \n",
" \"Ecoincentivo\" : 20, \n",
" \"Versamento annullato\" : 20, \n",
" \"Storico\" : 20, \n",
" \"Esenzione PH eredi\" : 20, \n",
" \"No minivoltura\" : 20, \n",
" \"Perdita possesso con DS\" : 20, \n",
" \"Rientro in possesso\" : 20, \n",
" \"Esenzione PH o storica successiva\" : 20\n",
" }\"\"\"\n",
"\n",
"class_dict = {\"Doc. sanitaria scaduta\" : 3, \n",
" \"Esenzione per ultratrentennale\" : 3, \n",
" \"Variazioni tecniche\" : 3, \n",
" \"Furto tardivo\" : 3,\n",
" \"Rientro ecoincentivo\" :3\n",
" }\n",
"oversample = RandomOverSampler(sampling_strategy=class_dict)\n",
"\n",
"X_over, y_over = oversample.fit_resample(df[[\"NOTE_OPERATORE\", \"CASISTICA_LAVORAZIONE\"]].to_numpy(), df[\"CASISTICA_MOTIVAZIONE\"].to_numpy())\n",
"df_over = pd.DataFrame({\"NOTE_OPERATORE\":X_over[:,0], \"CASISTICA_LAVORAZIONE\":X_over[:,1],\"CASISTICA_MOTIVAZIONE\":y_over})\n",
"\n",
"sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)\n",
"#sss = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)\n",
"\n",
"sss_val = StratifiedShuffleSplit(n_splits=1, test_size=0.33, random_state=0)\n",
"\n",
"\n",
"for i, (train_val_index, test_index) in enumerate(sss.split(X_over, y_over)):\n",
" train_val_index_df = df_over.iloc[train_val_index]\n",
" test_index_df = df_over.iloc[test_index]\n",
"\n",
"for i, (train_index, val_index) in enumerate(sss_val.split(train_val_index_df[[\"NOTE_OPERATORE\", \"CASISTICA_LAVORAZIONE\"]], train_val_index_df[\"CASISTICA_MOTIVAZIONE\"])):\n",
" train_index_df = train_val_index_df.iloc[train_index]\n",
" val_index_df = train_val_index_df.iloc[val_index]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8773dae8",
"metadata": {},
"outputs": [],
"source": [
"for i in [\"Doc. sanitaria scaduta\", \"Esenzione per ultratrentennale\", \"Variazioni tecniche\", \"Furto tardivo\", \"Rientro ecoincentivo\", \"Rateizzazione\", \"EREDI\", \"Domiciliazione assente\"]:\n",
" print(\"I: {}\".format(i))\n",
" print(train_index_df[train_index_df[\"CASISTICA_MOTIVAZIONE\"] == i])\n",
" print(\"-------------------------------------------------\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "400afd91",
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, random_state=1)\n",
"X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)\n",
"\n",
"train_df = pd.DataFrame({\"NOTE_OPERATORE\":X_train[:,0], \"CASISTICA_MOTIVAZIONE\":y_train})\n",
"test_df = pd.DataFrame({\"NOTE_OPERATORE\":X_test[:,0], \"CASISTICA_MOTIVAZIONE\":y_test})\n",
"val_df = pd.DataFrame({\"NOTE_OPERATORE\":X_val[:,0], \"CASISTICA_MOTIVAZIONE\":y_val})"
]
},
{
"cell_type": "code",
"execution_count": 166,
"id": "45c06658",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"34"
]
},
"execution_count": 166,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tmp = pd.DataFrame({\"col\":test_index_df[\"CASISTICA_MOTIVAZIONE\"]})\n",
"tmp[\"C\"]=1\n",
"tmp.groupby(\"col\").count().size"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "18e51bf4",
"metadata": {},
"outputs": [],
"source": [
"def add_casistica_lavorazione(orig_df):\n",
" res_df = orig_df.copy()\n",
" res_df[\"C\"] = res_df.apply(lambda row: row[\"CASISTICA_LAVORAZIONE\"] in row[\"input_doc\"], axis=1)\n",
" res_df[\"input_doc\"] =np.where(res_df[\"C\"], res_df[\"NOTE_OPERATORE\"],res_df[\"NOTE_OPERATORE\"] + \" \" + res_df[\"CASISTICA_LAVORAZIONE\"])\n",
" res_df = res_df.drop(\"C\", axis=1)\n",
" return res_df\n",
"\n",
"def replace_punct(text):\n",
" text = ''.join([char for char in text if char not in string.punctuation])\n",
" return text\n",
"\n",
"def remove_stopwords(text):\n",
" ita_stop_words = get_stop_words('italian')\n",
" pattern = r'\\b(?:' + '|'.join(re.escape(s) for s in ita_stop_words) + r')\\b'\n",
" return re.sub(pattern, '', text)\n",
"\n",
"def transform_df(orig_df):\n",
" res_df = orig_df.copy()\n",
" res_df[\"input_doc\"] = res_df[\"NOTE_OPERATORE\"]\n",
" #res_df = add_casistica_lavorazione(res_df)\n",
"\n",
" #res_df[\"input_doc\"] = res_df[\"input_doc\"].apply(lambda x: remove_stopwords(x))\n",
" #res_df[\"input_doc\"] = res_df[\"input_doc\"].apply(lambda x: replace_punct(x))\n",
" return res_df\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f4f34dc8",
"metadata": {},
"outputs": [],
"source": [
"\n",
"def convert(input_df, outfile):\n",
" \n",
" nlp = spacy.load(\"it_core_news_md\")\n",
" #nlp = spacy.blank(\"it\")\n",
" \n",
" db = DocBin()\n",
" \n",
" categories = input_df[\"CASISTICA_MOTIVAZIONE\"].drop_duplicates().to_list()\n",
"\n",
" for index, row in input_df.iterrows():\n",
"\n",
" row_doc = row[\"input_doc\"]\n",
" doc = nlp.make_doc(row_doc)\n",
" doc.cats = {category: 0 for category in categories}\n",
" doc.cats[row[\"CASISTICA_MOTIVAZIONE\"]] = 1\n",
" db.add(doc)\n",
" db.to_disk(outfile)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "16aec46c",
"metadata": {},
"outputs": [],
"source": [
"run_count+=1\n",
"run = \"run_{}\".format(run_count)\n"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "664bc3e2",
"metadata": {},
"outputs": [],
"source": [
"spacy_data_path = \"C:\\\\Users\\\\a.torchi\\\\Desktop\\\\text_classification\\\\spacy\\\\spacy_data\\\\{}\".format(run)\n",
"if(not os.path.exists(spacy_data_path)):\n",
" os.mkdir(spacy_data_path)\n",
"\n",
"spacy_result_path = \"C:\\\\Users\\\\a.torchi\\\\Desktop\\\\text_classification\\\\spacy\\\\spacy_result\\\\{}\".format(run)\n",
"if(not os.path.exists(spacy_result_path)):\n",
" os.mkdir(spacy_result_path)\n",
"\n",
"convert(transform_df(train_index_df), os.path.join(spacy_data_path, \"train.spacy\"))\n",
"convert(transform_df(test_index_df), os.path.join(spacy_data_path, \"test.spacy\"))\n",
"convert(transform_df(val_index_df), os.path.join(spacy_data_path, \"val.spacy\"))\n"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "024904da",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[38;5;4m Generated config template specific for your use case\u001b[0m\n",
"- Language: it\n",
"- Pipeline: textcat\n",
"- Optimize for: efficiency\n",
"- Hardware: CPU\n",
"- Transformer: None\n",
"\u001b[38;5;2m✔ Auto-filled config with all values\u001b[0m\n",
"\u001b[38;5;2m✔ Saved config\u001b[0m\n",
"spacy\\config.cfg\n",
"You can now add your data and train your pipeline:\n",
"python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy\n"
]
}
],
"source": [
"!python -m spacy init config --lang it --pipeline textcat spacy/config.cfg --force\n"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "0f817027",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[38;5;2m✔ Created output directory: spacy\\spacy_result\\run_9\\textcat_model\u001b[0m\n",
"\u001b[38;5;4m Saving to output directory:\n",
"spacy\\spacy_result\\run_9\\textcat_model\u001b[0m\n",
"\u001b[38;5;4m Using CPU\u001b[0m\n",
"\u001b[1m\n",
"=========================== Initializing pipeline ===========================\u001b[0m\n",
"\u001b[38;5;2m✔ Initialized pipeline\u001b[0m\n",
"\u001b[1m\n",
"============================= Training pipeline =============================\u001b[0m\n",
"\u001b[38;5;4m Pipeline: ['textcat']\u001b[0m\n",
"\u001b[38;5;4m Initial learn rate: 0.001\u001b[0m\n",
"E # LOSS TEXTCAT CATS_SCORE SCORE \n",
"--- ------ ------------ ---------- ------\n",
" 0 0 0.03 6.37 0.06\n",
" 1 200 5.12 9.25 0.09\n",
" 3 400 3.61 11.22 0.11\n",
" 6 600 2.68 16.08 0.16\n",
" 9 800 2.08 19.02 0.19\n",
" 12 1000 1.66 23.11 0.23\n",
" 17 1200 1.37 28.34 0.28\n",
" 23 1400 1.11 31.72 0.32\n",
" 29 1600 0.92 36.20 0.36\n",
" 38 1800 0.77 39.51 0.40\n",
" 48 2000 0.64 38.71 0.39\n",
" 60 2200 0.54 39.26 0.39\n",
" 75 2400 0.45 39.72 0.40\n",
" 90 2600 0.39 40.38 0.40\n",
"106 2800 0.34 39.79 0.40\n",
"121 3000 0.30 41.13 0.41\n",
"136 3200 0.27 43.58 0.44\n",
"152 3400 0.24 43.38 0.43\n",
"167 3600 0.22 44.35 0.44\n",
"183 3800 0.20 44.19 0.44\n",
"198 4000 0.19 44.34 0.44\n",
"213 4200 0.18 44.66 0.45\n",
"229 4400 0.17 44.29 0.44\n",
"244 4600 0.15 44.49 0.44\n",
"259 4800 0.15 44.45 0.44\n",
"275 5000 0.14 44.60 0.45\n",
"290 5200 0.13 44.55 0.45\n",
"306 5400 0.12 44.46 0.44\n",
"321 5600 0.12 44.41 0.44\n",
"336 5800 0.11 44.25 0.44\n",
"\u001b[38;5;2m✔ Saved pipeline to output directory\u001b[0m\n",
"spacy\\spacy_result\\run_9\\textcat_model\\model-last\n",
"\u001b[38;5;4m Using CPU\u001b[0m\n",
"\u001b[1m\n",
"================================== Results ==================================\u001b[0m\n",
"\n",
"TOK 100.00\n",
"TEXTCAT (macro F) 42.74 \n",
"SPEED 98743 \n",
"\n",
"\u001b[1m\n",
"=========================== Textcat F (per label) ===========================\u001b[0m\n",
"\n",
" P R F\n",
"Periodo diverso 83.46 86.11 84.77\n",
"Generica 81.97 88.76 85.23\n",
"Esenzione PH senza istanza 59.26 57.14 58.18\n",
"Esportazione non annotata 42.86 26.09 32.43\n",
"Acquisto nel mese di rinnovo; concessionario 50.00 13.33 21.05\n",
"No esenzione PH 50.00 14.29 22.22\n",
"Vendita tardiva 76.14 76.14 76.14\n",
"No minivoltura 100.00 50.00 66.67\n",
"Acquisto nel mese di rinnovo; privato 64.71 55.00 59.46\n",
"Ricevuta non valida 76.92 50.00 60.61\n",
"Furto non annotato 69.44 64.10 66.67\n",
"Generica 2_non allegata documentazione 20.00 8.33 11.76\n",
"Fermo 93.33 82.35 87.50\n",
"Esenzione PH o storica successiva 40.00 33.33 36.36\n",
"Regione diversa 44.83 61.90 52.00\n",
"Domiciliazione tardiva 40.00 28.57 33.33\n",
"Demolizione 66.67 46.15 54.55\n",
"Esenzione successiva 0.00 0.00 0.00\n",
"Perdita possesso con DS 33.33 25.00 28.57\n",
"Storico 0.00 0.00 0.00\n",
"Sequestro non annotato 50.00 42.86 46.15\n",
"Veicolo diverso 88.89 57.14 69.57\n",
"EREDI 0.00 0.00 0.00\n",
"Esenzione PH eredi 0.00 0.00 0.00\n",
"Rientro in possesso 0.00 0.00 0.00\n",
"Esenzione per ultratrentennale 100.00 100.00 100.00\n",
"Versamento annullato 0.00 0.00 0.00\n",
"Domiciliazione assente 0.00 0.00 0.00\n",
"Ecoincentivo 0.00 0.00 0.00\n",
"Furto tardivo 100.00 100.00 100.00\n",
"Rientro ecoincentivo 0.00 0.00 0.00\n",
"Variazioni tecniche 100.00 100.00 100.00\n",
"Doc. sanitaria scaduta 100.00 100.00 100.00\n",
"Rateizzazione 0.00 0.00 0.00\n",
"\n",
"\u001b[1m\n",
"======================== Textcat ROC AUC (per label) ========================\u001b[0m\n",
"\n",
" ROC AUC\n",
"Periodo diverso 0.98\n",
"Generica 0.87\n",
"Esenzione PH senza istanza 0.97\n",
"Esportazione non annotata 0.92\n",
"Acquisto nel mese di rinnovo; concessionario 0.94\n",
"No esenzione PH 0.90\n",
"Vendita tardiva 0.97\n",
"No minivoltura 0.91\n",
"Acquisto nel mese di rinnovo; privato 0.97\n",
"Ricevuta non valida 0.97\n",
"Furto non annotato 0.99\n",
"Generica 2_non allegata documentazione 0.53\n",
"Fermo 0.96\n",
"Esenzione PH o storica successiva 0.85\n",
"Regione diversa 0.95\n",
"Domiciliazione tardiva 0.95\n",
"Demolizione 0.99\n",
"Esenzione successiva 0.64\n",
"Perdita possesso con DS 0.89\n",
"Storico 0.98\n",
"Sequestro non annotato 0.77\n",
"Veicolo diverso 0.99\n",
"EREDI 0.56\n",
"Esenzione PH eredi 0.63\n",
"Rientro in possesso 0.80\n",
"Esenzione per ultratrentennale 1.00\n",
"Versamento annullato 0.57\n",
"Domiciliazione assente 1.00\n",
"Ecoincentivo 0.87\n",
"Furto tardivo 1.00\n",
"Rientro ecoincentivo 0.93\n",
"Variazioni tecniche 1.00\n",
"Doc. sanitaria scaduta 1.00\n",
"Rateizzazione 0.87\n",
"\n",
"\u001b[38;5;2m✔ Saved results to spacy\\spacy_result\\run_9\\metrics.json\u001b[0m\n"
]
}
],
"source": [
"!python -m spacy train spacy/config.cfg --paths.train ./spacy/spacy_data/{run}/train.spacy --paths.dev ./spacy/spacy_data/{run}/val.spacy --output spacy/spacy_result/{run}/textcat_model\n",
"!python -m spacy evaluate ./spacy/spacy_result/{run}/textcat_model/model-best/ --output ./spacy/spacy_result/{run}/metrics.json ./spacy/spacy_data/{run}/test.spacy"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}