{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"id": "051c5726",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from matplotlib import pyplot as plt\n",
"import numpy as np\n",
"from wordcloud import WordCloud\n",
"\n",
"import spacy\n",
"from spacy.lang.pt.stop_words import STOP_WORDS\n",
"from spacy.tokens import DocBin\n",
"\n",
"import json\n",
"import re\n",
"import string\n",
"from stop_words import get_stop_words\n",
"import os\n",
"\n",
"import math\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold\n",
"\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "310c031d",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option(\"display.max_colwidth\", 255)\n",
"pd.set_option('display.max_rows', 255)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cc6821d0",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_excel(\"lemma-mapping23-nltk.xlsx\")\n",
"cnt = df.groupby(\"CASISTICA_MOTIVAZIONE\").count() \\\n",
".rename(columns={\"Unnamed: 0\": \"count\"})[\"count\"]\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "775e1f62",
"metadata": {},
"outputs": [],
"source": [
"run_count = 0"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "b3b74b7a",
"metadata": {},
"outputs": [],
"source": [
"\"\"\"class_dict = {\"Doc. sanitaria scaduta\" : 20, \n",
" \"Esenzione per ultratrentennale\" : 20, \n",
" \"Variazioni tecniche\" : 20, \n",
" \"Furto tardivo\" : 20, \n",
" \"Rientro ecoincentivo\" : 20, \n",
" \"Rateizzazione\" : 20, \n",
" \"EREDI\" : 20, \n",
" \"Domiciliazione assente\" : 20, \n",
" \"Esenzione successiva\" : 20, \n",
" \"Ecoincentivo\" : 20, \n",
" \"Versamento annullato\" : 20, \n",
" \"Storico\" : 20, \n",
" \"Esenzione PH eredi\" : 20, \n",
" \"No minivoltura\" : 20, \n",
" \"Perdita possesso con DS\" : 20, \n",
" \"Rientro in possesso\" : 20, \n",
" \"Esenzione PH o storica successiva\" : 20\n",
" }\"\"\"\n",
"\n",
"class_dict = {\"Doc. sanitaria scaduta\" : 3, \n",
" \"Esenzione per ultratrentennale\" : 3, \n",
" \"Variazioni tecniche\" : 3, \n",
" \"Furto tardivo\" : 3,\n",
" \"Rientro ecoincentivo\" :3\n",
" }\n",
"oversample = RandomOverSampler(sampling_strategy=class_dict)\n",
"\n",
"X_over, y_over = oversample.fit_resample(df[[\"NOTE_OPERATORE\", \"CASISTICA_LAVORAZIONE\"]].to_numpy(), df[\"CASISTICA_MOTIVAZIONE\"].to_numpy())\n",
"df_over = pd.DataFrame({\"NOTE_OPERATORE\":X_over[:,0], \"CASISTICA_LAVORAZIONE\":X_over[:,1],\"CASISTICA_MOTIVAZIONE\":y_over})\n",
"\n",
"sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)\n",
"#sss = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)\n",
"\n",
"sss_val = StratifiedShuffleSplit(n_splits=1, test_size=0.33, random_state=0)\n",
"\n",
"\n",
"for i, (train_val_index, test_index) in enumerate(sss.split(X_over, y_over)):\n",
" train_val_index_df = df_over.iloc[train_val_index]\n",
" test_index_df = df_over.iloc[test_index]\n",
"\n",
"for i, (train_index, val_index) in enumerate(sss_val.split(train_val_index_df[[\"NOTE_OPERATORE\", \"CASISTICA_LAVORAZIONE\"]], train_val_index_df[\"CASISTICA_MOTIVAZIONE\"])):\n",
" train_index_df = train_val_index_df.iloc[train_index]\n",
" val_index_df = train_val_index_df.iloc[val_index]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8773dae8",
"metadata": {},
"outputs": [],
"source": [
"for i in [\"Doc. sanitaria scaduta\", \"Esenzione per ultratrentennale\", \"Variazioni tecniche\", \"Furto tardivo\", \"Rientro ecoincentivo\", \"Rateizzazione\", \"EREDI\", \"Domiciliazione assente\"]:\n",
" print(\"I: {}\".format(i))\n",
" print(train_index_df[train_index_df[\"CASISTICA_MOTIVAZIONE\"] == i])\n",
" print(\"-------------------------------------------------\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "400afd91",
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size=0.2, random_state=1)\n",
"X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)\n",
"\n",
"train_df = pd.DataFrame({\"NOTE_OPERATORE\":X_train[:,0], \"CASISTICA_MOTIVAZIONE\":y_train})\n",
"test_df = pd.DataFrame({\"NOTE_OPERATORE\":X_test[:,0], \"CASISTICA_MOTIVAZIONE\":y_test})\n",
"val_df = pd.DataFrame({\"NOTE_OPERATORE\":X_val[:,0], \"CASISTICA_MOTIVAZIONE\":y_val})"
]
},
{
"cell_type": "code",
"execution_count": 166,
"id": "45c06658",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"34"
]
},
"execution_count": 166,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tmp = pd.DataFrame({\"col\":test_index_df[\"CASISTICA_MOTIVAZIONE\"]})\n",
"tmp[\"C\"]=1\n",
"tmp.groupby(\"col\").count().size"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "18e51bf4",
"metadata": {},
"outputs": [],
"source": [
"def add_casistica_lavorazione(orig_df):\n",
" res_df = orig_df.copy()\n",
" res_df[\"C\"] = res_df.apply(lambda row: row[\"CASISTICA_LAVORAZIONE\"] in row[\"input_doc\"], axis=1)\n",
" res_df[\"input_doc\"] =np.where(res_df[\"C\"], res_df[\"NOTE_OPERATORE\"],res_df[\"NOTE_OPERATORE\"] + \" \" + res_df[\"CASISTICA_LAVORAZIONE\"])\n",
" res_df = res_df.drop(\"C\", axis=1)\n",
" return res_df\n",
"\n",
"def replace_punct(text):\n",
" text = ''.join([char for char in text if char not in string.punctuation])\n",
" return text\n",
"\n",
"def remove_stopwords(text):\n",
" ita_stop_words = get_stop_words('italian')\n",
" pattern = r'\\b(?:' + '|'.join(re.escape(s) for s in ita_stop_words) + r')\\b'\n",
" return re.sub(pattern, '', text)\n",
"\n",
"def transform_df(orig_df):\n",
" res_df = orig_df.copy()\n",
" res_df[\"input_doc\"] = res_df[\"NOTE_OPERATORE\"]\n",
" #res_df = add_casistica_lavorazione(res_df)\n",
"\n",
" #res_df[\"input_doc\"] = res_df[\"input_doc\"].apply(lambda x: remove_stopwords(x))\n",
" #res_df[\"input_doc\"] = res_df[\"input_doc\"].apply(lambda x: replace_punct(x))\n",
" return res_df\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f4f34dc8",
"metadata": {},
"outputs": [],
"source": [
"\n",
"def convert(input_df, outfile):\n",
" \n",
" nlp = spacy.load(\"it_core_news_md\")\n",
" #nlp = spacy.blank(\"it\")\n",
" \n",
" db = DocBin()\n",
" \n",
" categories = input_df[\"CASISTICA_MOTIVAZIONE\"].drop_duplicates().to_list()\n",
"\n",
" for index, row in input_df.iterrows():\n",
"\n",
" row_doc = row[\"input_doc\"]\n",
" doc = nlp.make_doc(row_doc)\n",
" doc.cats = {category: 0 for category in categories}\n",
" doc.cats[row[\"CASISTICA_MOTIVAZIONE\"]] = 1\n",
" db.add(doc)\n",
" db.to_disk(outfile)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "16aec46c",
"metadata": {},
"outputs": [],
"source": [
"run_count+=1\n",
"run = \"run_{}\".format(run_count)\n"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "664bc3e2",
"metadata": {},
"outputs": [],
"source": [
"spacy_data_path = \"C:\\\\Users\\\\a.torchi\\\\Desktop\\\\text_classification\\\\spacy\\\\spacy_data\\\\{}\".format(run)\n",
"if(not os.path.exists(spacy_data_path)):\n",
" os.mkdir(spacy_data_path)\n",
"\n",
"spacy_result_path = \"C:\\\\Users\\\\a.torchi\\\\Desktop\\\\text_classification\\\\spacy\\\\spacy_result\\\\{}\".format(run)\n",
"if(not os.path.exists(spacy_result_path)):\n",
" os.mkdir(spacy_result_path)\n",
"\n",
"convert(transform_df(train_index_df), os.path.join(spacy_data_path, \"train.spacy\"))\n",
"convert(transform_df(test_index_df), os.path.join(spacy_data_path, \"test.spacy\"))\n",
"convert(transform_df(val_index_df), os.path.join(spacy_data_path, \"val.spacy\"))\n"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "024904da",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[38;5;4m Generated config template specific for your use case\u001b[0m\n",
"- Language: it\n",
"- Pipeline: textcat\n",
"- Optimize for: efficiency\n",
"- Hardware: CPU\n",
"- Transformer: None\n",
"\u001b[38;5;2m✔ Auto-filled config with all values\u001b[0m\n",
"\u001b[38;5;2m✔ Saved config\u001b[0m\n",
"spacy\\config.cfg\n",
"You can now add your data and train your pipeline:\n",
"python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy\n"
]
}
],
"source": [
"!python -m spacy init config --lang it --pipeline textcat spacy/config.cfg --force\n"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "0f817027",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[38;5;2m✔ Created output directory: spacy\\spacy_result\\run_9\\textcat_model\u001b[0m\n",
"\u001b[38;5;4m Saving to output directory:\n",
"spacy\\spacy_result\\run_9\\textcat_model\u001b[0m\n",
"\u001b[38;5;4m Using CPU\u001b[0m\n",
"\u001b[1m\n",
"=========================== Initializing pipeline ===========================\u001b[0m\n",
"\u001b[38;5;2m✔ Initialized pipeline\u001b[0m\n",
"\u001b[1m\n",
"============================= Training pipeline =============================\u001b[0m\n",
"\u001b[38;5;4m Pipeline: ['textcat']\u001b[0m\n",
"\u001b[38;5;4m Initial learn rate: 0.001\u001b[0m\n",
"E # LOSS TEXTCAT CATS_SCORE SCORE \n",
"--- ------ ------------ ---------- ------\n",
" 0 0 0.03 6.37 0.06\n",
" 1 200 5.12 9.25 0.09\n",
" 3 400 3.61 11.22 0.11\n",
" 6 600 2.68 16.08 0.16\n",
" 9 800 2.08 19.02 0.19\n",
" 12 1000 1.66 23.11 0.23\n",
" 17 1200 1.37 28.34 0.28\n",
" 23 1400 1.11 31.72 0.32\n",
" 29 1600 0.92 36.20 0.36\n",
" 38 1800 0.77 39.51 0.40\n",
" 48 2000 0.64 38.71 0.39\n",
" 60 2200 0.54 39.26 0.39\n",
" 75 2400 0.45 39.72 0.40\n",
" 90 2600 0.39 40.38 0.40\n",
"106 2800 0.34 39.79 0.40\n",
"121 3000 0.30 41.13 0.41\n",
"136 3200 0.27 43.58 0.44\n",
"152 3400 0.24 43.38 0.43\n",
"167 3600 0.22 44.35 0.44\n",
"183 3800 0.20 44.19 0.44\n",
"198 4000 0.19 44.34 0.44\n",
"213 4200 0.18 44.66 0.45\n",
"229 4400 0.17 44.29 0.44\n",
"244 4600 0.15 44.49 0.44\n",
"259 4800 0.15 44.45 0.44\n",
"275 5000 0.14 44.60 0.45\n",
"290 5200 0.13 44.55 0.45\n",
"306 5400 0.12 44.46 0.44\n",
"321 5600 0.12 44.41 0.44\n",
"336 5800 0.11 44.25 0.44\n",
"\u001b[38;5;2m✔ Saved pipeline to output directory\u001b[0m\n",
"spacy\\spacy_result\\run_9\\textcat_model\\model-last\n",
"\u001b[38;5;4m Using CPU\u001b[0m\n",
"\u001b[1m\n",
"================================== Results ==================================\u001b[0m\n",
"\n",
"TOK 100.00\n",
"TEXTCAT (macro F) 42.74 \n",
"SPEED 98743 \n",
"\n",
"\u001b[1m\n",
"=========================== Textcat F (per label) ===========================\u001b[0m\n",
"\n",
" P R F\n",
"Periodo diverso 83.46 86.11 84.77\n",
"Generica 81.97 88.76 85.23\n",
"Esenzione PH senza istanza 59.26 57.14 58.18\n",
"Esportazione non annotata 42.86 26.09 32.43\n",
"Acquisto nel mese di rinnovo; concessionario 50.00 13.33 21.05\n",
"No esenzione PH 50.00 14.29 22.22\n",
"Vendita tardiva 76.14 76.14 76.14\n",
"No minivoltura 100.00 50.00 66.67\n",
"Acquisto nel mese di rinnovo; privato 64.71 55.00 59.46\n",
"Ricevuta non valida 76.92 50.00 60.61\n",
"Furto non annotato 69.44 64.10 66.67\n",
"Generica 2_non allegata documentazione 20.00 8.33 11.76\n",
"Fermo 93.33 82.35 87.50\n",
"Esenzione PH o storica successiva 40.00 33.33 36.36\n",
"Regione diversa 44.83 61.90 52.00\n",
"Domiciliazione tardiva 40.00 28.57 33.33\n",
"Demolizione 66.67 46.15 54.55\n",
"Esenzione successiva 0.00 0.00 0.00\n",
"Perdita possesso con DS 33.33 25.00 28.57\n",
"Storico 0.00 0.00 0.00\n",
"Sequestro non annotato 50.00 42.86 46.15\n",
"Veicolo diverso 88.89 57.14 69.57\n",
"EREDI 0.00 0.00 0.00\n",
"Esenzione PH eredi 0.00 0.00 0.00\n",
"Rientro in possesso 0.00 0.00 0.00\n",
"Esenzione per ultratrentennale 100.00 100.00 100.00\n",
"Versamento annullato 0.00 0.00 0.00\n",
"Domiciliazione assente 0.00 0.00 0.00\n",
"Ecoincentivo 0.00 0.00 0.00\n",
"Furto tardivo 100.00 100.00 100.00\n",
"Rientro ecoincentivo 0.00 0.00 0.00\n",
"Variazioni tecniche 100.00 100.00 100.00\n",
"Doc. sanitaria scaduta 100.00 100.00 100.00\n",
"Rateizzazione 0.00 0.00 0.00\n",
"\n",
"\u001b[1m\n",
"======================== Textcat ROC AUC (per label) ========================\u001b[0m\n",
"\n",
" ROC AUC\n",
"Periodo diverso 0.98\n",
"Generica 0.87\n",
"Esenzione PH senza istanza 0.97\n",
"Esportazione non annotata 0.92\n",
"Acquisto nel mese di rinnovo; concessionario 0.94\n",
"No esenzione PH 0.90\n",
"Vendita tardiva 0.97\n",
"No minivoltura 0.91\n",
"Acquisto nel mese di rinnovo; privato 0.97\n",
"Ricevuta non valida 0.97\n",
"Furto non annotato 0.99\n",
"Generica 2_non allegata documentazione 0.53\n",
"Fermo 0.96\n",
"Esenzione PH o storica successiva 0.85\n",
"Regione diversa 0.95\n",
"Domiciliazione tardiva 0.95\n",
"Demolizione 0.99\n",
"Esenzione successiva 0.64\n",
"Perdita possesso con DS 0.89\n",
"Storico 0.98\n",
"Sequestro non annotato 0.77\n",
"Veicolo diverso 0.99\n",
"EREDI 0.56\n",
"Esenzione PH eredi 0.63\n",
"Rientro in possesso 0.80\n",
"Esenzione per ultratrentennale 1.00\n",
"Versamento annullato 0.57\n",
"Domiciliazione assente 1.00\n",
"Ecoincentivo 0.87\n",
"Furto tardivo 1.00\n",
"Rientro ecoincentivo 0.93\n",
"Variazioni tecniche 1.00\n",
"Doc. sanitaria scaduta 1.00\n",
"Rateizzazione 0.87\n",
"\n",
"\u001b[38;5;2m✔ Saved results to spacy\\spacy_result\\run_9\\metrics.json\u001b[0m\n"
]
}
],
"source": [
"!python -m spacy train spacy/config.cfg --paths.train ./spacy/spacy_data/{run}/train.spacy --paths.dev ./spacy/spacy_data/{run}/val.spacy --output spacy/spacy_result/{run}/textcat_model\n",
"!python -m spacy evaluate ./spacy/spacy_result/{run}/textcat_model/model-best/ --output ./spacy/spacy_result/{run}/metrics.json ./spacy/spacy_data/{run}/test.spacy"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}