aria-text_classification-da.../rule.ipynb

2177 lines
89 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "051c5726",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from matplotlib import pyplot as plt\n",
"import numpy as np\n",
"from wordcloud import WordCloud\n",
"\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.model_selection import KFold, StratifiedShuffleSplit\n",
"from sklearn.metrics import confusion_matrix\n",
"from sklearn.metrics import ConfusionMatrixDisplay\n",
"\n",
"import re\n",
"import string\n",
"from stop_words import get_stop_words\n",
"\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"\n",
"import spacy\n",
"\n",
"from nltk import ngrams\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "310c031d",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option(\"display.max_colwidth\", 255)\n",
"pd.set_option('display.max_rows', 255)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "cc6821d0",
"metadata": {},
"outputs": [],
"source": [
"# Read the spreadsheet that every later cell works on.\n",
"df = pd.read_excel(\"training-nltk-mapping3.xlsx\")\n",
"\n",
"# Per-class number of non-null \"Unnamed: 0\" values, kept as a Series named \"count\".\n",
"cnt = (\n",
"    df.groupby(\"CASISTICA_MOTIVAZIONE\")[\"Unnamed: 0\"]\n",
"    .count()\n",
"    .rename(\"count\")\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4b8cbd26",
"metadata": {},
"outputs": [],
"source": [
"def prune_infrequent_classes(input_df: pd.DataFrame):\n",
" card_thresh = 10\n",
" cnt = input_df[[\"CASISTICA_MOTIVAZIONE\",\"NOTE_OPERATORE\"]].groupby([\"CASISTICA_MOTIVAZIONE\"]).count().rename({\"NOTE_OPERATORE\":\"count\"},axis=1)\n",
" res = input_df.merge(cnt, on=\"CASISTICA_MOTIVAZIONE\")\n",
" res = res[res[\"count\"] >= card_thresh]\n",
" res = res.drop(\"count\", axis=1)\n",
" return res\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "3903d9da",
"metadata": {},
"outputs": [],
"source": [
"def unique_str(l:list[str]):\n",
" return list(set(l))\n",
"\n",
"def apply_ngram(l:list[str], n:int):\n",
"    \"\"\"Materialise the n-grams that nltk's ``ngrams`` yields for ``l`` as a list of tuples.\"\"\"\n",
"    return [gram for gram in ngrams(l, n)]\n",
"\n",
"def join_string_list(l:list[tuple[str]]):\n",
" return [\" \".join(list(i)) for i in l]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c0d08201",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\a.torchi\\AppData\\Local\\Temp\\ipykernel_31116\\2333237563.py:5: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_cp[\"NOTE_OPERATORE_lower\"] = df_cp[\"NOTE_OPERATORE\"].str.lower()\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>CASISTICA_MOTIVAZIONE</th>\n",
" <th>NOTE_OPERATORE_2gram_unique_words</th>\n",
" <th>ngram_occurrencies</th>\n",
" <th>record_per_class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Periodo diverso</td>\n",
" <td>allega vers</td>\n",
" <td>28</td>\n",
" <td>831</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Generica</td>\n",
" <td>veicolo intestato</td>\n",
" <td>15</td>\n",
" <td>2763</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Generica</td>\n",
" <td>essere mai</td>\n",
" <td>14</td>\n",
" <td>2763</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Periodo diverso</td>\n",
" <td>periodi tributari</td>\n",
" <td>14</td>\n",
" <td>831</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Periodo diverso</td>\n",
" <td>scansiona versamento</td>\n",
" <td>14</td>\n",
" <td>831</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Furto non annotato</td>\n",
" <td>furto trascritto</td>\n",
" <td>12</td>\n",
" <td>116</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Periodo diverso</td>\n",
" <td>relativa vers.</td>\n",
" <td>12</td>\n",
" <td>831</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Periodo diverso</td>\n",
" <td>versamento periodo</td>\n",
" <td>12</td>\n",
" <td>831</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Periodo diverso</td>\n",
" <td>relativa anno</td>\n",
" <td>11</td>\n",
" <td>831</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Generica</td>\n",
" <td>dichiara vendita</td>\n",
" <td>10</td>\n",
" <td>2763</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Generica</td>\n",
" <td>intestato società</td>\n",
" <td>10</td>\n",
" <td>2763</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Generica</td>\n",
" <td>modalità pagamento</td>\n",
" <td>10</td>\n",
" <td>2763</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Generica</td>\n",
" <td>scrive essere</td>\n",
" <td>10</td>\n",
" <td>2763</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Periodo diverso</td>\n",
" <td>diverso periodo</td>\n",
" <td>10</td>\n",
" <td>831</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>lombardo mese</td>\n",
" <td>9</td>\n",
" <td>257</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Periodo diverso</td>\n",
" <td>successivo versamento</td>\n",
" <td>9</td>\n",
" <td>831</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Generica</td>\n",
" <td>annotazione carta</td>\n",
" <td>9</td>\n",
" <td>2763</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>Generica</td>\n",
" <td>demolizione annotata</td>\n",
" <td>9</td>\n",
" <td>2763</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>Generica</td>\n",
" <td>dichiara demolizione</td>\n",
" <td>9</td>\n",
" <td>2763</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>Generica</td>\n",
" <td>documentazione carente</td>\n",
" <td>9</td>\n",
" <td>2763</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>Generica</td>\n",
" <td>nessun atto</td>\n",
" <td>9</td>\n",
" <td>2763</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>Generica</td>\n",
" <td>risulta intestatario</td>\n",
" <td>9</td>\n",
" <td>2763</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>Generica</td>\n",
" <td>documento didentità</td>\n",
" <td>8</td>\n",
" <td>2763</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>Generica</td>\n",
" <td>on line</td>\n",
" <td>8</td>\n",
" <td>2763</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>venduta dopo</td>\n",
" <td>8</td>\n",
" <td>257</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" CASISTICA_MOTIVAZIONE NOTE_OPERATORE_2gram_unique_words \\\n",
"0 Periodo diverso allega vers \n",
"1 Generica veicolo intestato \n",
"2 Generica essere mai \n",
"3 Periodo diverso periodi tributari \n",
"4 Periodo diverso scansiona versamento \n",
"5 Furto non annotato furto trascritto \n",
"6 Periodo diverso relativa vers. \n",
"7 Periodo diverso versamento periodo \n",
"8 Periodo diverso relativa anno \n",
"9 Generica dichiara vendita \n",
"10 Generica intestato società \n",
"11 Generica modalità pagamento \n",
"12 Generica scrive essere \n",
"13 Periodo diverso diverso periodo \n",
"14 Vendita tardiva lombardo mese \n",
"15 Periodo diverso successivo versamento \n",
"16 Generica annotazione carta \n",
"17 Generica demolizione annotata \n",
"18 Generica dichiara demolizione \n",
"19 Generica documentazione carente \n",
"20 Generica nessun atto \n",
"21 Generica risulta intestatario \n",
"22 Generica documento didentità \n",
"23 Generica on line \n",
"24 Vendita tardiva venduta dopo \n",
"\n",
" ngram_occurrencies record_per_class \n",
"0 28 831 \n",
"1 15 2763 \n",
"2 14 2763 \n",
"3 14 831 \n",
"4 14 831 \n",
"5 12 116 \n",
"6 12 831 \n",
"7 12 831 \n",
"8 11 831 \n",
"9 10 2763 \n",
"10 10 2763 \n",
"11 10 2763 \n",
"12 10 2763 \n",
"13 10 831 \n",
"14 9 257 \n",
"15 9 831 \n",
"16 9 2763 \n",
"17 9 2763 \n",
"18 9 2763 \n",
"19 9 2763 \n",
"20 9 2763 \n",
"21 9 2763 \n",
"22 8 2763 \n",
"23 8 2763 \n",
"24 8 257 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def get_unique_ngrams(df: pd.DataFrame, n: int):\n",
"    \"\"\"Return the n-grams of ``NOTE_OPERATORE`` that occur in exactly one class.\n",
"\n",
"    Notes are lower-cased and split on single spaces; an n-gram is counted at\n",
"    most once per record. Records without a usable note are skipped.\n",
"\n",
"    Args:\n",
"        df: frame with the ``NOTE_OPERATORE`` and ``CASISTICA_MOTIVAZIONE`` columns.\n",
"        n: size of the n-grams (1 means single words).\n",
"\n",
"    Returns:\n",
"        A frame with the columns ``CASISTICA_MOTIVAZIONE``,\n",
"        ``NOTE_OPERATORE_{n}gram_unique_words``, ``ngram_occurrencies`` (records of\n",
"        the class containing the n-gram) and ``record_per_class`` (rows of ``df``\n",
"        with that class), ordered by ``ngram_occurrencies`` descending with ties\n",
"        broken by class and n-gram.\n",
"\n",
"    Raises:\n",
"        ValueError: if ``n`` is smaller than 1.\n",
"    \"\"\"\n",
"    if n < 1:\n",
"        raise ValueError(\"n must be a positive integer, got {}\".format(n))\n",
"\n",
"    label_col = \"CASISTICA_MOTIVAZIONE\"\n",
"    ngram_col_name = \"NOTE_OPERATORE_{}gram_unique_words\".format(str(n))\n",
"\n",
"    def distinct_ngrams(words):\n",
"        # 1-grams are the words themselves; larger n-grams are de-duplicated as\n",
"        # tuples first and only then joined into space-separated strings.\n",
"        if n == 1:\n",
"            return unique_str(words)\n",
"        return join_string_list(unique_str(apply_ngram(words, n)))\n",
"\n",
"    # assign() works on a copy, so no column is ever written onto a slice of\n",
"    # the caller's frame (the source of the former SettingWithCopyWarning).\n",
"    per_record = (\n",
"        df[[label_col, \"NOTE_OPERATORE\"]]\n",
"        .assign(tokens=lambda records: records[\"NOTE_OPERATORE\"].str.lower().str.split(\" \"))\n",
"        # a missing note yields a missing token list, which cannot be turned into n-grams\n",
"        .dropna(subset=[\"tokens\"])\n",
"        .assign(**{ngram_col_name: lambda records: records[\"tokens\"].apply(distinct_ngrams)})\n",
"    )\n",
"\n",
"    # One row per (record, n-gram), then the number of records per (class, n-gram).\n",
"    # size() counts rows, so the result does not depend on nulls in another column.\n",
"    per_class = (\n",
"        per_record[[label_col, ngram_col_name]]\n",
"        .explode(ngram_col_name)\n",
"        .groupby([label_col, ngram_col_name])\n",
"        .size()\n",
"        .reset_index(name=\"ngram_occurrencies\")\n",
"    )\n",
"\n",
"    # per_class has one row per (class, n-gram) pair, so the group size of an\n",
"    # n-gram is the number of classes it appears in; keep the single-class ones.\n",
"    classes_per_ngram = per_class.groupby(ngram_col_name)[label_col].transform(\"size\")\n",
"    res = per_class[classes_per_ngram == 1].sort_values(\n",
"        by=[\"ngram_occurrencies\", label_col, ngram_col_name],\n",
"        ascending=[False, True, True],\n",
"    )\n",
"\n",
"    # Class sizes come from the full input frame, including records without a note.\n",
"    class_sizes = df.groupby(label_col).size().reset_index(name=\"record_per_class\")\n",
"\n",
"    return res.merge(class_sizes, on=label_col)\n",
"\n",
"\n",
"gram_dim = 2\n",
"res = get_unique_ngrams(df, n=gram_dim)\n",
"\n",
"# Preview the first 25 rows of the result.\n",
"res.head(25)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f05a5897",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>CASISTICA_MOTIVAZIONE</th>\n",
" <th>NOTE_OPERATORE_split_3gram</th>\n",
" <th>count_placeholder</th>\n",
" <th>CASISTICA_MOTIVAZIONE_selfjoin</th>\n",
" <th>count_selfjoin</th>\n",
" <th>sum_selfjoin</th>\n",
" <th>tf_idf</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2627</th>\n",
" <td>Generica</td>\n",
" <td>(mai, stato, proprietario)</td>\n",
" <td>11</td>\n",
" <td>Periodo diverso</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>3.666667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2626</th>\n",
" <td>Generica</td>\n",
" <td>(mai, stato, proprietario)</td>\n",
" <td>11</td>\n",
" <td>No esenzione PH</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>3.666667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2625</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(manca, vers, allega)</td>\n",
" <td>11</td>\n",
" <td>Generica</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3.666667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2587</th>\n",
" <td>Furto non annotato</td>\n",
" <td>(annotata, pra, già)</td>\n",
" <td>8</td>\n",
" <td>Esportazione non annotata</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2588</th>\n",
" <td>Furto non annotato</td>\n",
" <td>(annotata, pra, già)</td>\n",
" <td>8</td>\n",
" <td>Generica</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2386</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(periodo, tributario, versamento)</td>\n",
" <td>4</td>\n",
" <td>Generica</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2335</th>\n",
" <td>Regione diversa</td>\n",
" <td>(versamento, favore, regione)</td>\n",
" <td>4</td>\n",
" <td>Periodo diverso</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2347</th>\n",
" <td>Furto non annotato</td>\n",
" <td>(furto, allega, denuncia)</td>\n",
" <td>4</td>\n",
" <td>Generica</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2356</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(vendita, bollo, dovuto)</td>\n",
" <td>4</td>\n",
" <td>Generica</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2384</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(versamento, allegato, relativo)</td>\n",
" <td>4</td>\n",
" <td>Vendita tardiva</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2379</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(periodo, tributario, già)</td>\n",
" <td>4</td>\n",
" <td>Periodo diverso</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2345</th>\n",
" <td>Furto non annotato</td>\n",
" <td>(denuncia, furto, mai)</td>\n",
" <td>4</td>\n",
" <td>Generica</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2302</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(allega, bollo, pagato)</td>\n",
" <td>4</td>\n",
" <td>Generica</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2340</th>\n",
" <td>Generica</td>\n",
" <td>(presa, carico, demolitore)</td>\n",
" <td>4</td>\n",
" <td>Fermo</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2315</th>\n",
" <td>No esenzione PH</td>\n",
" <td>(invalidità, idoneo, rilascio)</td>\n",
" <td>4</td>\n",
" <td>Generica</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2295</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(non, presente, bollo)</td>\n",
" <td>4</td>\n",
" <td>Vendita tardiva</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2296</th>\n",
" <td>Generica</td>\n",
" <td>(altra, regione, periodo)</td>\n",
" <td>4</td>\n",
" <td>Acquisto nel mese di rinnovo; concessionario</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2317</th>\n",
" <td>Generica</td>\n",
" <td>(nessun, versamento, presente)</td>\n",
" <td>4</td>\n",
" <td>Periodo diverso</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2316</th>\n",
" <td>Esenzione PH senza istanza</td>\n",
" <td>(presente, cliente, avvisato)</td>\n",
" <td>4</td>\n",
" <td>Generica</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2336</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(non, presente, ricevuta)</td>\n",
" <td>4</td>\n",
" <td>Ricevuta non valida</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2355</th>\n",
" <td>Generica</td>\n",
" <td>(domiciliazione, bancaria, attiva)</td>\n",
" <td>4</td>\n",
" <td>Domiciliazione tardiva</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2282</th>\n",
" <td>Generica</td>\n",
" <td>(visura, pra, risulta)</td>\n",
" <td>4</td>\n",
" <td>Vendita tardiva</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2280</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(veicolo, venduto, mese)</td>\n",
" <td>4</td>\n",
" <td>Generica</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2273</th>\n",
" <td>Generica</td>\n",
" <td>(bollo, dovuto, cliente)</td>\n",
" <td>4</td>\n",
" <td>Demolizione</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2277</th>\n",
" <td>Generica</td>\n",
" <td>(perdita, possesso, trascritta)</td>\n",
" <td>4</td>\n",
" <td>Sequestro non annotato</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2330</th>\n",
" <td>Regione diversa</td>\n",
" <td>(versamento, effettuato, favore)</td>\n",
" <td>4</td>\n",
" <td>Periodo diverso</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2383</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(utile, atto, di)</td>\n",
" <td>4</td>\n",
" <td>Acquisto nel mese di rinnovo; privato</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2272</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(non, congruo, versamento)</td>\n",
" <td>4</td>\n",
" <td>Veicolo diverso</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2270</th>\n",
" <td>Generica</td>\n",
" <td>(pratica, lavorata, nuovo)</td>\n",
" <td>4</td>\n",
" <td>Periodo diverso</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2309</th>\n",
" <td>Generica</td>\n",
" <td>(veicolo, demolito, periodo)</td>\n",
" <td>4</td>\n",
" <td>Demolizione</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2279</th>\n",
" <td>Generica</td>\n",
" <td>(targa, errata, targa)</td>\n",
" <td>4</td>\n",
" <td>Regione diversa</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2304</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(presente, allega, versamenti)</td>\n",
" <td>4</td>\n",
" <td>Veicolo diverso</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2645</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(vendita, tardivo, veicolo)</td>\n",
" <td>13</td>\n",
" <td>Acquisto nel mese di rinnovo; concessionario</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>4.333333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2646</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(vendita, tardivo, veicolo)</td>\n",
" <td>13</td>\n",
" <td>Acquisto nel mese di rinnovo; privato</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>4.333333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2607</th>\n",
" <td>Generica</td>\n",
" <td>(veicolo, targato, targa)</td>\n",
" <td>9</td>\n",
" <td>No esenzione PH</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>4.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2608</th>\n",
" <td>Generica</td>\n",
" <td>(veicolo, targato, targa)</td>\n",
" <td>9</td>\n",
" <td>Veicolo diverso</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>4.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2463</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(regione, versamento, non)</td>\n",
" <td>5</td>\n",
" <td>Regione diversa</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>5.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2616</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(versamento, riferito, altra)</td>\n",
" <td>10</td>\n",
" <td>Veicolo diverso</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>5.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2615</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(versamento, riferito, altra)</td>\n",
" <td>10</td>\n",
" <td>Generica</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>5.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2467</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(richiesta, versamento, non)</td>\n",
" <td>5</td>\n",
" <td>Esenzione PH senza istanza</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>5.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2462</th>\n",
" <td>Generica</td>\n",
" <td>(atto, altra, regione)</td>\n",
" <td>5</td>\n",
" <td>Acquisto nel mese di rinnovo; privato</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>5.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2390</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(bollo, dovuto, vendita)</td>\n",
" <td>5</td>\n",
" <td>Generica 2_non allegata documentazione</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>5.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2612</th>\n",
" <td>Furto non annotato</td>\n",
" <td>(perdita, possesso, furto)</td>\n",
" <td>10</td>\n",
" <td>Rientro in possesso</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>5.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2395</th>\n",
" <td>Generica</td>\n",
" <td>(rientro, possesso, data)</td>\n",
" <td>5</td>\n",
" <td>Rientro in possesso</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>5.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2394</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(presente, aci/tabaccaio/agenzia, versamento)</td>\n",
" <td>5</td>\n",
" <td>Ricevuta non valida</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>5.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2476</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(allega, ricevuta, pagamento)</td>\n",
" <td>5</td>\n",
" <td>Generica</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>5.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2448</th>\n",
" <td>Generica</td>\n",
" <td>(dichiara, aver, pagato)</td>\n",
" <td>5</td>\n",
" <td>Vendita tardiva</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>5.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2446</th>\n",
" <td>Generica</td>\n",
" <td>(disabile, fiscalmente, carico)</td>\n",
" <td>5</td>\n",
" <td>No esenzione PH</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>5.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2420</th>\n",
" <td>Esenzione PH senza istanza</td>\n",
" <td>(disabile, respinta, manca)</td>\n",
" <td>5</td>\n",
" <td>No esenzione PH</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>5.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2611</th>\n",
" <td>Furto non annotato</td>\n",
" <td>(perdita, possesso, furto)</td>\n",
" <td>10</td>\n",
" <td>Generica</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>5.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2662</th>\n",
" <td>Generica</td>\n",
" <td>(cliente, dichiara, aver)</td>\n",
" <td>16</td>\n",
" <td>Vendita tardiva</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>5.333333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2664</th>\n",
" <td>Generica</td>\n",
" <td>(allega, carta, circolazione)</td>\n",
" <td>16</td>\n",
" <td>Esenzione PH senza istanza</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>5.333333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2665</th>\n",
" <td>Generica</td>\n",
" <td>(allega, carta, circolazione)</td>\n",
" <td>16</td>\n",
" <td>Esportazione non annotata</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>5.333333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2666</th>\n",
" <td>Generica</td>\n",
" <td>(allega, carta, circolazione)</td>\n",
" <td>16</td>\n",
" <td>Vendita tardiva</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>5.333333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2663</th>\n",
" <td>Generica</td>\n",
" <td>(cliente, dichiara, aver)</td>\n",
" <td>16</td>\n",
" <td>Esportazione non annotata</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>5.333333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2635</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(non, presente, versamento)</td>\n",
" <td>11</td>\n",
" <td>Regione diversa</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>5.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2636</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(non, presente, versamento)</td>\n",
" <td>11</td>\n",
" <td>Veicolo diverso</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>5.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2483</th>\n",
" <td>Generica</td>\n",
" <td>(documentazione, allegata, targa)</td>\n",
" <td>6</td>\n",
" <td>Periodo diverso</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>6.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2639</th>\n",
" <td>Generica</td>\n",
" <td>(allega, certificato, storico)</td>\n",
" <td>12</td>\n",
" <td>Esenzione PH senza istanza</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>6.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2534</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(x, periodo, tributario)</td>\n",
" <td>6</td>\n",
" <td>Generica</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>6.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2512</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(versamento, allegato, anno)</td>\n",
" <td>6</td>\n",
" <td>Generica</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>6.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2533</th>\n",
" <td>Generica</td>\n",
" <td>(carta, circolazione, annotazione)</td>\n",
" <td>6</td>\n",
" <td>Esenzione PH senza istanza</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>6.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2518</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(pagamento, atto, di)</td>\n",
" <td>6</td>\n",
" <td>Acquisto nel mese di rinnovo; privato</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>6.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2482</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(non, presente, allegato)</td>\n",
" <td>6</td>\n",
" <td>Esenzione PH senza istanza</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>6.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2642</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(bollo, allegato, corrisponde)</td>\n",
" <td>12</td>\n",
" <td>Generica</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>6.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2644</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(bollo, pagato, x)</td>\n",
" <td>12</td>\n",
" <td>Regione diversa</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>6.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2643</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(bollo, pagato, x)</td>\n",
" <td>12</td>\n",
" <td>Generica</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>6.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2479</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(allegato, anno, manca)</td>\n",
" <td>6</td>\n",
" <td>Generica</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>6.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2652</th>\n",
" <td>Generica</td>\n",
" <td>(atto, data, certa)</td>\n",
" <td>13</td>\n",
" <td>Furto non annotato</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>6.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2651</th>\n",
" <td>Generica</td>\n",
" <td>(atto, data, certa)</td>\n",
" <td>13</td>\n",
" <td>Esportazione non annotata</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>6.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2683</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(presente, non, congruo)</td>\n",
" <td>27</td>\n",
" <td>Veicolo diverso</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>6.750000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2680</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(versam, presente, non)</td>\n",
" <td>27</td>\n",
" <td>Versamento annullato</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>6.750000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2682</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(presente, non, congruo)</td>\n",
" <td>27</td>\n",
" <td>Versamento annullato</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>6.750000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2681</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(versam, presente, non)</td>\n",
" <td>27</td>\n",
" <td>Veicolo diverso</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>6.750000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2737</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(atto, di, vendita)</td>\n",
" <td>152</td>\n",
" <td>Domiciliazione tardiva</td>\n",
" <td>1</td>\n",
" <td>22</td>\n",
" <td>6.909091</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2740</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(atto, di, vendita)</td>\n",
" <td>152</td>\n",
" <td>No minivoltura</td>\n",
" <td>1</td>\n",
" <td>22</td>\n",
" <td>6.909091</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2745</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(atto, di, vendita)</td>\n",
" <td>152</td>\n",
" <td>Acquisto nel mese di rinnovo; privato</td>\n",
" <td>7</td>\n",
" <td>22</td>\n",
" <td>6.909091</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2744</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(atto, di, vendita)</td>\n",
" <td>152</td>\n",
" <td>Periodo diverso</td>\n",
" <td>6</td>\n",
" <td>22</td>\n",
" <td>6.909091</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2743</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(atto, di, vendita)</td>\n",
" <td>152</td>\n",
" <td>Demolizione</td>\n",
" <td>2</td>\n",
" <td>22</td>\n",
" <td>6.909091</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2739</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(atto, di, vendita)</td>\n",
" <td>152</td>\n",
" <td>Generica 2_non allegata documentazione</td>\n",
" <td>1</td>\n",
" <td>22</td>\n",
" <td>6.909091</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2738</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(atto, di, vendita)</td>\n",
" <td>152</td>\n",
" <td>Esportazione non annotata</td>\n",
" <td>1</td>\n",
" <td>22</td>\n",
" <td>6.909091</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2742</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(atto, di, vendita)</td>\n",
" <td>152</td>\n",
" <td>Acquisto nel mese di rinnovo; concessionario</td>\n",
" <td>2</td>\n",
" <td>22</td>\n",
" <td>6.909091</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2741</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(atto, di, vendita)</td>\n",
" <td>152</td>\n",
" <td>Veicolo diverso</td>\n",
" <td>1</td>\n",
" <td>22</td>\n",
" <td>6.909091</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2654</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(dovuto, versamento, non)</td>\n",
" <td>14</td>\n",
" <td>Perdita possesso con DS successiva</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>7.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2653</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(dovuto, versamento, non)</td>\n",
" <td>14</td>\n",
" <td>Esportazione non annotata</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>7.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2579</th>\n",
" <td>Generica</td>\n",
" <td>(allega, presa, carico)</td>\n",
" <td>7</td>\n",
" <td>Fermo</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>7.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2572</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(presente, allega, versamento)</td>\n",
" <td>7</td>\n",
" <td>Veicolo diverso</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>7.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2576</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(corrisponde, periodo, richiesto)</td>\n",
" <td>7</td>\n",
" <td>Regione diversa</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>7.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2578</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(pagato, x, periodo)</td>\n",
" <td>7</td>\n",
" <td>Generica</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>7.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2566</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(ricevuta, allegata, riferita)</td>\n",
" <td>7</td>\n",
" <td>Veicolo diverso</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>7.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2731</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(di, vendita, tardivo)</td>\n",
" <td>147</td>\n",
" <td>Domiciliazione tardiva</td>\n",
" <td>1</td>\n",
" <td>19</td>\n",
" <td>7.736842</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2733</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(di, vendita, tardivo)</td>\n",
" <td>147</td>\n",
" <td>Veicolo diverso</td>\n",
" <td>1</td>\n",
" <td>19</td>\n",
" <td>7.736842</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2732</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(di, vendita, tardivo)</td>\n",
" <td>147</td>\n",
" <td>No minivoltura</td>\n",
" <td>1</td>\n",
" <td>19</td>\n",
" <td>7.736842</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2735</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(di, vendita, tardivo)</td>\n",
" <td>147</td>\n",
" <td>Periodo diverso</td>\n",
" <td>6</td>\n",
" <td>19</td>\n",
" <td>7.736842</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2734</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(di, vendita, tardivo)</td>\n",
" <td>147</td>\n",
" <td>Acquisto nel mese di rinnovo; concessionario</td>\n",
" <td>2</td>\n",
" <td>19</td>\n",
" <td>7.736842</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2736</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(di, vendita, tardivo)</td>\n",
" <td>147</td>\n",
" <td>Acquisto nel mese di rinnovo; privato</td>\n",
" <td>7</td>\n",
" <td>19</td>\n",
" <td>7.736842</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2730</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(di, vendita, tardivo)</td>\n",
" <td>147</td>\n",
" <td>Demolizione</td>\n",
" <td>1</td>\n",
" <td>19</td>\n",
" <td>7.736842</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2591</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(riferito, periodo, tributario)</td>\n",
" <td>8</td>\n",
" <td>Generica</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>8.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2589</th>\n",
" <td>Fermo</td>\n",
" <td>(fermo, amministrativo, fermo)</td>\n",
" <td>8</td>\n",
" <td>Generica</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>8.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2583</th>\n",
" <td>Furto non annotato</td>\n",
" <td>(furto, mai, trascritto)</td>\n",
" <td>8</td>\n",
" <td>Generica</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>8.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2586</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(atto, vendita, atto)</td>\n",
" <td>8</td>\n",
" <td>Generica 2_non allegata documentazione</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>8.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2590</th>\n",
" <td>Generica</td>\n",
" <td>(manca, carta, circolazione)</td>\n",
" <td>8</td>\n",
" <td>Esportazione non annotata</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>8.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2602</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(ricevuta, relativa, diverso)</td>\n",
" <td>9</td>\n",
" <td>Veicolo diverso</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>9.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2610</th>\n",
" <td>Regione diversa</td>\n",
" <td>(regione, bollo, pagato)</td>\n",
" <td>9</td>\n",
" <td>Generica</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>9.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2609</th>\n",
" <td>Generica</td>\n",
" <td>(dichiara, aver, venduto)</td>\n",
" <td>9</td>\n",
" <td>Vendita tardiva</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>9.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2614</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(tardivo, veicolo, venduto)</td>\n",
" <td>10</td>\n",
" <td>Generica</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>10.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2621</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(allegato, corrisponde, periodo)</td>\n",
" <td>10</td>\n",
" <td>Regione diversa</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>10.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2613</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(dovuto, atto, di)</td>\n",
" <td>10</td>\n",
" <td>Periodo diverso</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>10.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2620</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(non, congruo, allega)</td>\n",
" <td>10</td>\n",
" <td>Veicolo diverso</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>10.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2637</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(allega, versamento, anno)</td>\n",
" <td>11</td>\n",
" <td>Veicolo diverso</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>11.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2624</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(allegata, ricevuta, relativa)</td>\n",
" <td>11</td>\n",
" <td>Veicolo diverso</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>11.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2650</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(versamento, riferito, periodo)</td>\n",
" <td>13</td>\n",
" <td>Generica</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>13.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2661</th>\n",
" <td>Periodo diverso</td>\n",
" <td>(versamento, allegato, riferito)</td>\n",
" <td>15</td>\n",
" <td>Veicolo diverso</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>15.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2671</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(vendita, atto, di)</td>\n",
" <td>17</td>\n",
" <td>Generica 2_non allegata documentazione</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>17.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2693</th>\n",
" <td>Vendita tardiva</td>\n",
" <td>(vendita, tardivo, atto)</td>\n",
" <td>29</td>\n",
" <td>Acquisto nel mese di rinnovo; privato</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>29.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" CASISTICA_MOTIVAZIONE \\\n",
"2627 Generica \n",
"2626 Generica \n",
"2625 Periodo diverso \n",
"2587 Furto non annotato \n",
"2588 Furto non annotato \n",
"2386 Periodo diverso \n",
"2335 Regione diversa \n",
"2347 Furto non annotato \n",
"2356 Vendita tardiva \n",
"2384 Periodo diverso \n",
"2379 Vendita tardiva \n",
"2345 Furto non annotato \n",
"2302 Periodo diverso \n",
"2340 Generica \n",
"2315 No esenzione PH \n",
"2295 Periodo diverso \n",
"2296 Generica \n",
"2317 Generica \n",
"2316 Esenzione PH senza istanza \n",
"2336 Periodo diverso \n",
"2355 Generica \n",
"2282 Generica \n",
"2280 Vendita tardiva \n",
"2273 Generica \n",
"2277 Generica \n",
"2330 Regione diversa \n",
"2383 Vendita tardiva \n",
"2272 Periodo diverso \n",
"2270 Generica \n",
"2309 Generica \n",
"2279 Generica \n",
"2304 Periodo diverso \n",
"2645 Vendita tardiva \n",
"2646 Vendita tardiva \n",
"2607 Generica \n",
"2608 Generica \n",
"2463 Periodo diverso \n",
"2616 Periodo diverso \n",
"2615 Periodo diverso \n",
"2467 Periodo diverso \n",
"2462 Generica \n",
"2390 Vendita tardiva \n",
"2612 Furto non annotato \n",
"2395 Generica \n",
"2394 Periodo diverso \n",
"2476 Periodo diverso \n",
"2448 Generica \n",
"2446 Generica \n",
"2420 Esenzione PH senza istanza \n",
"2611 Furto non annotato \n",
"2662 Generica \n",
"2664 Generica \n",
"2665 Generica \n",
"2666 Generica \n",
"2663 Generica \n",
"2635 Periodo diverso \n",
"2636 Periodo diverso \n",
"2483 Generica \n",
"2639 Generica \n",
"2534 Periodo diverso \n",
"2512 Periodo diverso \n",
"2533 Generica \n",
"2518 Vendita tardiva \n",
"2482 Periodo diverso \n",
"2642 Periodo diverso \n",
"2644 Periodo diverso \n",
"2643 Periodo diverso \n",
"2479 Periodo diverso \n",
"2652 Generica \n",
"2651 Generica \n",
"2683 Periodo diverso \n",
"2680 Periodo diverso \n",
"2682 Periodo diverso \n",
"2681 Periodo diverso \n",
"2737 Vendita tardiva \n",
"2740 Vendita tardiva \n",
"2745 Vendita tardiva \n",
"2744 Vendita tardiva \n",
"2743 Vendita tardiva \n",
"2739 Vendita tardiva \n",
"2738 Vendita tardiva \n",
"2742 Vendita tardiva \n",
"2741 Vendita tardiva \n",
"2654 Vendita tardiva \n",
"2653 Vendita tardiva \n",
"2579 Generica \n",
"2572 Periodo diverso \n",
"2576 Periodo diverso \n",
"2578 Periodo diverso \n",
"2566 Periodo diverso \n",
"2731 Vendita tardiva \n",
"2733 Vendita tardiva \n",
"2732 Vendita tardiva \n",
"2735 Vendita tardiva \n",
"2734 Vendita tardiva \n",
"2736 Vendita tardiva \n",
"2730 Vendita tardiva \n",
"2591 Periodo diverso \n",
"2589 Fermo \n",
"2583 Furto non annotato \n",
"2586 Vendita tardiva \n",
"2590 Generica \n",
"2602 Periodo diverso \n",
"2610 Regione diversa \n",
"2609 Generica \n",
"2614 Vendita tardiva \n",
"2621 Periodo diverso \n",
"2613 Vendita tardiva \n",
"2620 Periodo diverso \n",
"2637 Periodo diverso \n",
"2624 Periodo diverso \n",
"2650 Periodo diverso \n",
"2661 Periodo diverso \n",
"2671 Vendita tardiva \n",
"2693 Vendita tardiva \n",
"\n",
" NOTE_OPERATORE_split_3gram count_placeholder \\\n",
"2627 (mai, stato, proprietario) 11 \n",
"2626 (mai, stato, proprietario) 11 \n",
"2625 (manca, vers, allega) 11 \n",
"2587 (annotata, pra, già) 8 \n",
"2588 (annotata, pra, già) 8 \n",
"2386 (periodo, tributario, versamento) 4 \n",
"2335 (versamento, favore, regione) 4 \n",
"2347 (furto, allega, denuncia) 4 \n",
"2356 (vendita, bollo, dovuto) 4 \n",
"2384 (versamento, allegato, relativo) 4 \n",
"2379 (periodo, tributario, già) 4 \n",
"2345 (denuncia, furto, mai) 4 \n",
"2302 (allega, bollo, pagato) 4 \n",
"2340 (presa, carico, demolitore) 4 \n",
"2315 (invalidità, idoneo, rilascio) 4 \n",
"2295 (non, presente, bollo) 4 \n",
"2296 (altra, regione, periodo) 4 \n",
"2317 (nessun, versamento, presente) 4 \n",
"2316 (presente, cliente, avvisato) 4 \n",
"2336 (non, presente, ricevuta) 4 \n",
"2355 (domiciliazione, bancaria, attiva) 4 \n",
"2282 (visura, pra, risulta) 4 \n",
"2280 (veicolo, venduto, mese) 4 \n",
"2273 (bollo, dovuto, cliente) 4 \n",
"2277 (perdita, possesso, trascritta) 4 \n",
"2330 (versamento, effettuato, favore) 4 \n",
"2383 (utile, atto, di) 4 \n",
"2272 (non, congruo, versamento) 4 \n",
"2270 (pratica, lavorata, nuovo) 4 \n",
"2309 (veicolo, demolito, periodo) 4 \n",
"2279 (targa, errata, targa) 4 \n",
"2304 (presente, allega, versamenti) 4 \n",
"2645 (vendita, tardivo, veicolo) 13 \n",
"2646 (vendita, tardivo, veicolo) 13 \n",
"2607 (veicolo, targato, targa) 9 \n",
"2608 (veicolo, targato, targa) 9 \n",
"2463 (regione, versamento, non) 5 \n",
"2616 (versamento, riferito, altra) 10 \n",
"2615 (versamento, riferito, altra) 10 \n",
"2467 (richiesta, versamento, non) 5 \n",
"2462 (atto, altra, regione) 5 \n",
"2390 (bollo, dovuto, vendita) 5 \n",
"2612 (perdita, possesso, furto) 10 \n",
"2395 (rientro, possesso, data) 5 \n",
"2394 (presente, aci/tabaccaio/agenzia, versamento) 5 \n",
"2476 (allega, ricevuta, pagamento) 5 \n",
"2448 (dichiara, aver, pagato) 5 \n",
"2446 (disabile, fiscalmente, carico) 5 \n",
"2420 (disabile, respinta, manca) 5 \n",
"2611 (perdita, possesso, furto) 10 \n",
"2662 (cliente, dichiara, aver) 16 \n",
"2664 (allega, carta, circolazione) 16 \n",
"2665 (allega, carta, circolazione) 16 \n",
"2666 (allega, carta, circolazione) 16 \n",
"2663 (cliente, dichiara, aver) 16 \n",
"2635 (non, presente, versamento) 11 \n",
"2636 (non, presente, versamento) 11 \n",
"2483 (documentazione, allegata, targa) 6 \n",
"2639 (allega, certificato, storico) 12 \n",
"2534 (x, periodo, tributario) 6 \n",
"2512 (versamento, allegato, anno) 6 \n",
"2533 (carta, circolazione, annotazione) 6 \n",
"2518 (pagamento, atto, di) 6 \n",
"2482 (non, presente, allegato) 6 \n",
"2642 (bollo, allegato, corrisponde) 12 \n",
"2644 (bollo, pagato, x) 12 \n",
"2643 (bollo, pagato, x) 12 \n",
"2479 (allegato, anno, manca) 6 \n",
"2652 (atto, data, certa) 13 \n",
"2651 (atto, data, certa) 13 \n",
"2683 (presente, non, congruo) 27 \n",
"2680 (versam, presente, non) 27 \n",
"2682 (presente, non, congruo) 27 \n",
"2681 (versam, presente, non) 27 \n",
"2737 (atto, di, vendita) 152 \n",
"2740 (atto, di, vendita) 152 \n",
"2745 (atto, di, vendita) 152 \n",
"2744 (atto, di, vendita) 152 \n",
"2743 (atto, di, vendita) 152 \n",
"2739 (atto, di, vendita) 152 \n",
"2738 (atto, di, vendita) 152 \n",
"2742 (atto, di, vendita) 152 \n",
"2741 (atto, di, vendita) 152 \n",
"2654 (dovuto, versamento, non) 14 \n",
"2653 (dovuto, versamento, non) 14 \n",
"2579 (allega, presa, carico) 7 \n",
"2572 (presente, allega, versamento) 7 \n",
"2576 (corrisponde, periodo, richiesto) 7 \n",
"2578 (pagato, x, periodo) 7 \n",
"2566 (ricevuta, allegata, riferita) 7 \n",
"2731 (di, vendita, tardivo) 147 \n",
"2733 (di, vendita, tardivo) 147 \n",
"2732 (di, vendita, tardivo) 147 \n",
"2735 (di, vendita, tardivo) 147 \n",
"2734 (di, vendita, tardivo) 147 \n",
"2736 (di, vendita, tardivo) 147 \n",
"2730 (di, vendita, tardivo) 147 \n",
"2591 (riferito, periodo, tributario) 8 \n",
"2589 (fermo, amministrativo, fermo) 8 \n",
"2583 (furto, mai, trascritto) 8 \n",
"2586 (atto, vendita, atto) 8 \n",
"2590 (manca, carta, circolazione) 8 \n",
"2602 (ricevuta, relativa, diverso) 9 \n",
"2610 (regione, bollo, pagato) 9 \n",
"2609 (dichiara, aver, venduto) 9 \n",
"2614 (tardivo, veicolo, venduto) 10 \n",
"2621 (allegato, corrisponde, periodo) 10 \n",
"2613 (dovuto, atto, di) 10 \n",
"2620 (non, congruo, allega) 10 \n",
"2637 (allega, versamento, anno) 11 \n",
"2624 (allegata, ricevuta, relativa) 11 \n",
"2650 (versamento, riferito, periodo) 13 \n",
"2661 (versamento, allegato, riferito) 15 \n",
"2671 (vendita, atto, di) 17 \n",
"2693 (vendita, tardivo, atto) 29 \n",
"\n",
" CASISTICA_MOTIVAZIONE_selfjoin count_selfjoin \\\n",
"2627 Periodo diverso 2 \n",
"2626 No esenzione PH 1 \n",
"2625 Generica 3 \n",
"2587 Esportazione non annotata 1 \n",
"2588 Generica 1 \n",
"2386 Generica 1 \n",
"2335 Periodo diverso 1 \n",
"2347 Generica 1 \n",
"2356 Generica 1 \n",
"2384 Vendita tardiva 1 \n",
"2379 Periodo diverso 1 \n",
"2345 Generica 1 \n",
"2302 Generica 1 \n",
"2340 Fermo 1 \n",
"2315 Generica 1 \n",
"2295 Vendita tardiva 1 \n",
"2296 Acquisto nel mese di rinnovo; concessionario 1 \n",
"2317 Periodo diverso 1 \n",
"2316 Generica 1 \n",
"2336 Ricevuta non valida 1 \n",
"2355 Domiciliazione tardiva 1 \n",
"2282 Vendita tardiva 1 \n",
"2280 Generica 1 \n",
"2273 Demolizione 1 \n",
"2277 Sequestro non annotato 1 \n",
"2330 Periodo diverso 1 \n",
"2383 Acquisto nel mese di rinnovo; privato 1 \n",
"2272 Veicolo diverso 1 \n",
"2270 Periodo diverso 1 \n",
"2309 Demolizione 1 \n",
"2279 Regione diversa 1 \n",
"2304 Veicolo diverso 1 \n",
"2645 Acquisto nel mese di rinnovo; concessionario 1 \n",
"2646 Acquisto nel mese di rinnovo; privato 2 \n",
"2607 No esenzione PH 1 \n",
"2608 Veicolo diverso 1 \n",
"2463 Regione diversa 1 \n",
"2616 Veicolo diverso 1 \n",
"2615 Generica 1 \n",
"2467 Esenzione PH senza istanza 1 \n",
"2462 Acquisto nel mese di rinnovo; privato 1 \n",
"2390 Generica 2_non allegata documentazione 1 \n",
"2612 Rientro in possesso 1 \n",
"2395 Rientro in possesso 1 \n",
"2394 Ricevuta non valida 1 \n",
"2476 Generica 1 \n",
"2448 Vendita tardiva 1 \n",
"2446 No esenzione PH 1 \n",
"2420 No esenzione PH 1 \n",
"2611 Generica 1 \n",
"2662 Vendita tardiva 1 \n",
"2664 Esenzione PH senza istanza 1 \n",
"2665 Esportazione non annotata 1 \n",
"2666 Vendita tardiva 1 \n",
"2663 Esportazione non annotata 2 \n",
"2635 Regione diversa 1 \n",
"2636 Veicolo diverso 1 \n",
"2483 Periodo diverso 1 \n",
"2639 Esenzione PH senza istanza 2 \n",
"2534 Generica 1 \n",
"2512 Generica 1 \n",
"2533 Esenzione PH senza istanza 1 \n",
"2518 Acquisto nel mese di rinnovo; privato 1 \n",
"2482 Esenzione PH senza istanza 1 \n",
"2642 Generica 2 \n",
"2644 Regione diversa 1 \n",
"2643 Generica 1 \n",
"2479 Generica 1 \n",
"2652 Furto non annotato 1 \n",
"2651 Esportazione non annotata 1 \n",
"2683 Veicolo diverso 3 \n",
"2680 Versamento annullato 1 \n",
"2682 Versamento annullato 1 \n",
"2681 Veicolo diverso 3 \n",
"2737 Domiciliazione tardiva 1 \n",
"2740 No minivoltura 1 \n",
"2745 Acquisto nel mese di rinnovo; privato 7 \n",
"2744 Periodo diverso 6 \n",
"2743 Demolizione 2 \n",
"2739 Generica 2_non allegata documentazione 1 \n",
"2738 Esportazione non annotata 1 \n",
"2742 Acquisto nel mese di rinnovo; concessionario 2 \n",
"2741 Veicolo diverso 1 \n",
"2654 Perdita possesso con DS successiva 1 \n",
"2653 Esportazione non annotata 1 \n",
"2579 Fermo 1 \n",
"2572 Veicolo diverso 1 \n",
"2576 Regione diversa 1 \n",
"2578 Generica 1 \n",
"2566 Veicolo diverso 1 \n",
"2731 Domiciliazione tardiva 1 \n",
"2733 Veicolo diverso 1 \n",
"2732 No minivoltura 1 \n",
"2735 Periodo diverso 6 \n",
"2734 Acquisto nel mese di rinnovo; concessionario 2 \n",
"2736 Acquisto nel mese di rinnovo; privato 7 \n",
"2730 Demolizione 1 \n",
"2591 Generica 1 \n",
"2589 Generica 1 \n",
"2583 Generica 1 \n",
"2586 Generica 2_non allegata documentazione 1 \n",
"2590 Esportazione non annotata 1 \n",
"2602 Veicolo diverso 1 \n",
"2610 Generica 1 \n",
"2609 Vendita tardiva 1 \n",
"2614 Generica 1 \n",
"2621 Regione diversa 1 \n",
"2613 Periodo diverso 1 \n",
"2620 Veicolo diverso 1 \n",
"2637 Veicolo diverso 1 \n",
"2624 Veicolo diverso 1 \n",
"2650 Generica 1 \n",
"2661 Veicolo diverso 1 \n",
"2671 Generica 2_non allegata documentazione 1 \n",
"2693 Acquisto nel mese di rinnovo; privato 1 \n",
"\n",
" sum_selfjoin tf_idf \n",
"2627 3 3.666667 \n",
"2626 3 3.666667 \n",
"2625 3 3.666667 \n",
"2587 2 4.000000 \n",
"2588 2 4.000000 \n",
"2386 1 4.000000 \n",
"2335 1 4.000000 \n",
"2347 1 4.000000 \n",
"2356 1 4.000000 \n",
"2384 1 4.000000 \n",
"2379 1 4.000000 \n",
"2345 1 4.000000 \n",
"2302 1 4.000000 \n",
"2340 1 4.000000 \n",
"2315 1 4.000000 \n",
"2295 1 4.000000 \n",
"2296 1 4.000000 \n",
"2317 1 4.000000 \n",
"2316 1 4.000000 \n",
"2336 1 4.000000 \n",
"2355 1 4.000000 \n",
"2282 1 4.000000 \n",
"2280 1 4.000000 \n",
"2273 1 4.000000 \n",
"2277 1 4.000000 \n",
"2330 1 4.000000 \n",
"2383 1 4.000000 \n",
"2272 1 4.000000 \n",
"2270 1 4.000000 \n",
"2309 1 4.000000 \n",
"2279 1 4.000000 \n",
"2304 1 4.000000 \n",
"2645 3 4.333333 \n",
"2646 3 4.333333 \n",
"2607 2 4.500000 \n",
"2608 2 4.500000 \n",
"2463 1 5.000000 \n",
"2616 2 5.000000 \n",
"2615 2 5.000000 \n",
"2467 1 5.000000 \n",
"2462 1 5.000000 \n",
"2390 1 5.000000 \n",
"2612 2 5.000000 \n",
"2395 1 5.000000 \n",
"2394 1 5.000000 \n",
"2476 1 5.000000 \n",
"2448 1 5.000000 \n",
"2446 1 5.000000 \n",
"2420 1 5.000000 \n",
"2611 2 5.000000 \n",
"2662 3 5.333333 \n",
"2664 3 5.333333 \n",
"2665 3 5.333333 \n",
"2666 3 5.333333 \n",
"2663 3 5.333333 \n",
"2635 2 5.500000 \n",
"2636 2 5.500000 \n",
"2483 1 6.000000 \n",
"2639 2 6.000000 \n",
"2534 1 6.000000 \n",
"2512 1 6.000000 \n",
"2533 1 6.000000 \n",
"2518 1 6.000000 \n",
"2482 1 6.000000 \n",
"2642 2 6.000000 \n",
"2644 2 6.000000 \n",
"2643 2 6.000000 \n",
"2479 1 6.000000 \n",
"2652 2 6.500000 \n",
"2651 2 6.500000 \n",
"2683 4 6.750000 \n",
"2680 4 6.750000 \n",
"2682 4 6.750000 \n",
"2681 4 6.750000 \n",
"2737 22 6.909091 \n",
"2740 22 6.909091 \n",
"2745 22 6.909091 \n",
"2744 22 6.909091 \n",
"2743 22 6.909091 \n",
"2739 22 6.909091 \n",
"2738 22 6.909091 \n",
"2742 22 6.909091 \n",
"2741 22 6.909091 \n",
"2654 2 7.000000 \n",
"2653 2 7.000000 \n",
"2579 1 7.000000 \n",
"2572 1 7.000000 \n",
"2576 1 7.000000 \n",
"2578 1 7.000000 \n",
"2566 1 7.000000 \n",
"2731 19 7.736842 \n",
"2733 19 7.736842 \n",
"2732 19 7.736842 \n",
"2735 19 7.736842 \n",
"2734 19 7.736842 \n",
"2736 19 7.736842 \n",
"2730 19 7.736842 \n",
"2591 1 8.000000 \n",
"2589 1 8.000000 \n",
"2583 1 8.000000 \n",
"2586 1 8.000000 \n",
"2590 1 8.000000 \n",
"2602 1 9.000000 \n",
"2610 1 9.000000 \n",
"2609 1 9.000000 \n",
"2614 1 10.000000 \n",
"2621 1 10.000000 \n",
"2613 1 10.000000 \n",
"2620 1 10.000000 \n",
"2637 1 11.000000 \n",
"2624 1 11.000000 \n",
"2650 1 13.000000 \n",
"2661 1 15.000000 \n",
"2671 1 17.000000 \n",
"2693 1 29.000000 "
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"n=3\n",
"splitted_col_name = \"NOTE_OPERATORE_split_{}gram\".format(str(n))\n",
"df_cp=df.copy()\n",
"\n",
"#concatenate every string within the same class\n",
"df_cp = df_cp.sort_values(by=\"CASISTICA_MOTIVAZIONE\")\n",
"df_cp[\"cat\"] = df_cp.groupby(['CASISTICA_MOTIVAZIONE'])['NOTE_OPERATORE'].transform(lambda x : ' '.join(x))\n",
"df_cp[\"cat\"] = df_cp[\"cat\"].str.lower()\n",
"df_cp = df_cp.groupby([\"CASISTICA_MOTIVAZIONE\", \"cat\"]).count().reset_index()\n",
"\n",
"#make column as a list of words (1-gram), then create a column of n-grams\n",
"df_cp[splitted_col_name] = df_cp[\"cat\"].str.split(\" \").apply(lambda x: apply_ngram(x, n))\n",
"\n",
"#count the occurrences for every n-gram for every class\n",
"df_cp[\"count_placeholder\"] = 1\n",
"df_cp = df_cp[[\"CASISTICA_MOTIVAZIONE\", splitted_col_name, \"count_placeholder\"]].explode(splitted_col_name)\\\n",
" .groupby([\"CASISTICA_MOTIVAZIONE\", splitted_col_name]).count() \\\n",
" .reset_index() \\\n",
" .sort_values(by=\"count_placeholder\")\n",
"\n",
"#self-join to find which classes (except the currently considerated one) have a given n-gram, for every n-gram and every class\n",
"df_cp_selfjoin = df_cp[[\"CASISTICA_MOTIVAZIONE\", splitted_col_name, \"count_placeholder\"]].rename(columns={\"CASISTICA_MOTIVAZIONE\": \"CASISTICA_MOTIVAZIONE_selfjoin\", \"count_placeholder\":\"count_selfjoin\"})\n",
"df_cp = df_cp.merge(df_cp_selfjoin, on=splitted_col_name)\n",
"df_cp = df_cp[df_cp[\"CASISTICA_MOTIVAZIONE\"] != df_cp[\"CASISTICA_MOTIVAZIONE_selfjoin\"]]\n",
"\n",
"# find how many occurrences of a given n-gram exist in every other class (except the currently considerated one), for every n-gram and every class\n",
"df_cp_tfidf = df_cp[[\"CASISTICA_MOTIVAZIONE\", splitted_col_name, \"count_selfjoin\"]].groupby([\"CASISTICA_MOTIVAZIONE\", splitted_col_name]).sum(\"count_selfjoin\") \\\n",
" .rename(columns={\"count_selfjoin\": \"sum_selfjoin\"})\n",
"df_cp = df_cp.merge(df_cp_tfidf, on=[\"CASISTICA_MOTIVAZIONE\", splitted_col_name])\n",
"\n",
"#for every n-gram and every class, divide the occurrences of that n-gram in that class by the sum of the occurrences of that n-gram in every other class\n",
"df_cp[\"tf_idf\"] = df_cp[\"count_placeholder\"] / df_cp[\"sum_selfjoin\"]\n",
"\n",
"df_cp = df_cp.sort_values(by=\"tf_idf\")\n",
"\n",
"df_cp.iloc[-115:]\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}