In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from wordcloud import WordCloud

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

import re
import string
from stop_words import get_stop_words

from imblearn.over_sampling import RandomOverSampler

import spacy

from nltk import ngrams



In [2]:
# Widen pandas' display limits so long text cells and tall frames are shown
# without truncation when a DataFrame is rendered.
pd.options.display.max_colwidth = 255
pd.options.display.max_rows = 255

In [3]:
# Load the training spreadsheet used by every cell below.
df = pd.read_excel("training-nltk-mapping3.xlsx")

# Records per CASISTICA_MOTIVAZIONE class, as a Series named "count".
# The count is taken on the "Unnamed: 0" column (presumably the index column
# saved with the spreadsheet -- TODO confirm), so only its non-null cells count.
cnt = (
    df.groupby("CASISTICA_MOTIVAZIONE")["Unnamed: 0"]
    .count()
    .rename("count")
)

In [None]:
def prune_infrequent_classes(input_df: pd.DataFrame, card_thresh: int = 10) -> pd.DataFrame:
    """Drop the rows that belong to under-represented CASISTICA_MOTIVAZIONE classes.

    A class is kept when it has at least ``card_thresh`` rows with a non-null
    NOTE_OPERATORE.

    Parameters
    ----------
    input_df : pd.DataFrame
        Must contain the columns "CASISTICA_MOTIVAZIONE" and "NOTE_OPERATORE".
    card_thresh : int, default 10
        Minimum class cardinality (inclusive) required to keep a class.

    Returns
    -------
    pd.DataFrame
        The surviving rows with the same columns as ``input_df``. The frame
        comes out of a merge, so the original index is not preserved.
    """
    # Pick a helper column name that cannot clash with an existing column:
    # a pre-existing "count" column would otherwise be suffixed by the merge
    # and the filter below would fail with a KeyError.
    size_col = "count"
    while size_col in input_df.columns:
        size_col = "_" + size_col

    class_sizes = (
        input_df.groupby("CASISTICA_MOTIVAZIONE")["NOTE_OPERATORE"]
        .count()
        .rename(size_col)
        .to_frame()
    )

    # Attach the class size to every row, filter on it, then drop the helper.
    merged = input_df.merge(class_sizes, on="CASISTICA_MOTIVAZIONE")
    kept = merged[merged[size_col] >= card_thresh]
    return kept.drop(size_col, axis=1)


In [11]:
def unique_str(l:list[str]):
    """Return the distinct items of ``l`` in first-occurrence order.

    ``dict.fromkeys`` is used instead of ``set`` so the result order is
    deterministic: set iteration order for strings changes between interpreter
    runs (hash randomisation), which makes re-runs of the notebook differ.
    Items must be hashable (strings or tuples of strings in this notebook).
    """
    return list(dict.fromkeys(l))

def apply_ngram(l:list[str], n:int):
    """Materialise the ``n``-grams that nltk's ``ngrams`` yields for ``l`` as a list."""
    return [gram for gram in ngrams(l, n)]

def join_string_list(l:list[tuple[str]]):
    """Turn each tuple of words in ``l`` into one space-separated string.

    Returns a new list with one string per input tuple, in the same order.
    """
    joined = []
    for words in l:
        joined.append(" ".join(words))
    return joined

In [None]:
def get_unique_ngrams(df: pd.DataFrame, n:int):
    """Find the n-grams of NOTE_OPERATORE that occur in exactly one class.

    Notes are lower-cased and split on single spaces; each n-gram is counted
    at most once per record.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain "CASISTICA_LAVORAZIONE", "NOTE_OPERATORE",
        "CASISTICA_MOTIVAZIONE" and "Unnamed: 0" (the latter is only used to
        count the records of each class; without it the function raises
        KeyError on "record_per_class").
    n : int
        Size of the n-grams (1 means single words).

    Returns
    -------
    pd.DataFrame
        One row per n-gram that appears in a single CASISTICA_MOTIVAZIONE
        class, with columns: "CASISTICA_MOTIVAZIONE",
        "NOTE_OPERATORE_{n}gram_unique_words", "ngram_occurrencies" (records
        of the class containing the n-gram, counted on non-null
        CASISTICA_LAVORAZIONE) and "record_per_class".
    """
    # Work on an explicit copy: adding columns to a bare column-slice of `df`
    # triggers pandas' SettingWithCopyWarning.
    df_cp = df[["CASISTICA_LAVORAZIONE","NOTE_OPERATORE","CASISTICA_MOTIVAZIONE"]].copy()

    # make all strings lowercase
    df_cp["NOTE_OPERATORE_lower"] = df_cp["NOTE_OPERATORE"].str.lower()

    # make column as a list of words (1-gram); for n == 1 this is also the
    # column named by `splitted_col_name`
    splitted_col_name = "NOTE_OPERATORE_split_{}gram".format(str(n))
    df_cp["NOTE_OPERATORE_split_1gram"] = df_cp["NOTE_OPERATORE_lower"].str.split(" ")

    # if n != 1 make a column as a list of n-grams
    if n != 1:
        df_cp[splitted_col_name] = df_cp["NOTE_OPERATORE_split_1gram"].apply(lambda x: apply_ngram(x, n))

    ngram_col_name = "NOTE_OPERATORE_{}gram_unique_words".format(str(n))

    # make sure every n-gram in every row is not repeated
    df_cp[ngram_col_name] = df_cp[splitted_col_name].apply(lambda x: unique_str(x))

    # join the words of every n-gram: list[tuple[str]] -> list[str]
    if n != 1:
        df_cp[ngram_col_name] = df_cp[ngram_col_name].apply(lambda x: join_string_list(x))

    # explode to one row per (record, n-gram), then count per [class, n-gram]
    # how many records contain the n-gram
    df_cp_ngram = (
        df_cp[[ngram_col_name, "CASISTICA_MOTIVAZIONE", "CASISTICA_LAVORAZIONE"]]
        .explode(ngram_col_name)
        .groupby(["CASISTICA_MOTIVAZIONE", ngram_col_name]).count()
        .sort_values(by=["CASISTICA_MOTIVAZIONE", "CASISTICA_LAVORAZIONE"], ascending=[True, False])
        .reset_index()
        .rename(columns={"CASISTICA_LAVORAZIONE": "count"})
    )

    # self-join on the n-gram: every n-gram gets one row per pair of classes
    # that contain it (the overlapping columns receive the _x / _y suffixes)
    df_cp_ngram_mrg = df_cp_ngram.merge(df_cp_ngram, how="left", on=ngram_col_name)
    df_cp_ngram_mrg = df_cp_ngram_mrg.rename(columns={"count_x": "ngram_occurrencies"})

    # per [class, n-gram], the number of joined rows is the number of classes
    # in which the n-gram appears
    classes_per_ngram = df_cp_ngram_mrg.groupby(["CASISTICA_MOTIVAZIONE_x", ngram_col_name]).count()
    classes_per_ngram = classes_per_ngram.reset_index()
    classes_per_ngram = classes_per_ngram[["CASISTICA_MOTIVAZIONE_x", ngram_col_name, "count_y"]] \
        .rename(columns={"count_y": "ngram_occurrencies_in_CASISTICA_MOTIVAZIONE", "CASISTICA_MOTIVAZIONE_x": "CASISTICA_MOTIVAZIONE"})

    # keep the n-grams occurring in just one class
    single_class_ngrams = classes_per_ngram[classes_per_ngram["ngram_occurrencies_in_CASISTICA_MOTIVAZIONE"] == 1]

    # merge back to recover the occurrence count of the surviving n-grams
    res = df_cp_ngram_mrg.merge(single_class_ngrams, on=ngram_col_name)

    res = res[["CASISTICA_MOTIVAZIONE_x", ngram_col_name, "ngram_occurrencies"]]
    res = res.rename(columns={"CASISTICA_MOTIVAZIONE_x": "CASISTICA_MOTIVAZIONE"})

    res = res.sort_values(by="ngram_occurrencies", ascending=False)

    # add "record_per_class": how many records occur in every class.
    # NOTE(review): the count is read from the "Unnamed: 0" column, which
    # presumably is the index saved with the spreadsheet -- confirm it is
    # always present and never null.
    df_grp_class = df.groupby("CASISTICA_MOTIVAZIONE").count().reset_index()
    df_grp_class = df_grp_class.rename(columns={"Unnamed: 0": "record_per_class"})
    df_grp_class = df_grp_class[["CASISTICA_MOTIVAZIONE", "record_per_class"]]

    res = res.merge(df_grp_class, on="CASISTICA_MOTIVAZIONE")

    return res


# Bigrams that occur in a single class; show the first 25 rows of the result.
gram_dim = 2
res = get_unique_ngrams(df, n=gram_dim)

res.head(25)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cp["NOTE_OPERATORE_lower"] = df_cp["NOTE_OPERATORE"].str.lower()


Unnamed: 0,CASISTICA_MOTIVAZIONE,NOTE_OPERATORE_2gram_unique_words,ngram_occurrencies,record_per_class
0,Periodo diverso,allega vers,28,831
1,Generica,veicolo intestato,15,2763
2,Generica,essere mai,14,2763
3,Periodo diverso,periodi tributari,14,831
4,Periodo diverso,scansiona versamento,14,831
5,Furto non annotato,furto trascritto,12,116
6,Periodo diverso,relativa vers.,12,831
7,Periodo diverso,versamento periodo,12,831
8,Periodo diverso,relativa anno,11,831
9,Generica,dichiara vendita,10,2763


In [None]:
n=3
splitted_col_name = "NOTE_OPERATORE_split_{}gram".format(str(n))
df_cp=df.copy()

# Concatenate every note of a class into one lowercase string per class.
df_cp = df_cp.sort_values(by="CASISTICA_MOTIVAZIONE")
df_cp["cat"] = df_cp.groupby(['CASISTICA_MOTIVAZIONE'])['NOTE_OPERATORE'].transform(lambda x : ' '.join(x))
df_cp["cat"] = df_cp["cat"].str.lower()
# "cat" is identical for all rows of a class, so this grouping leaves one row per class.
df_cp = df_cp.groupby(["CASISTICA_MOTIVAZIONE", "cat"]).count().reset_index()

# Split the class text into words and build the list of n-grams.
# Because the notes were concatenated, an n-gram can span two adjacent notes.
df_cp[splitted_col_name] = df_cp["cat"].str.split(" ").apply(lambda x: apply_ngram(x, n))

# Count the occurrences of every n-gram in every class
# (one exploded row per occurrence, counted through the placeholder column).
df_cp["count_placeholder"] = 1
df_cp = df_cp[["CASISTICA_MOTIVAZIONE", splitted_col_name, "count_placeholder"]].explode(splitted_col_name)\
    .groupby(["CASISTICA_MOTIVAZIONE", splitted_col_name]).count() \
    .reset_index() \
    .sort_values(by="count_placeholder")

# Self-join on the n-gram to pair every class with every OTHER class containing
# the same n-gram. N-grams found in a single class have no partner and are
# therefore dropped by the inequality filter.
df_cp_selfjoin = df_cp[["CASISTICA_MOTIVAZIONE", splitted_col_name, "count_placeholder"]].rename(columns={"CASISTICA_MOTIVAZIONE": "CASISTICA_MOTIVAZIONE_selfjoin", "count_placeholder":"count_selfjoin"})
df_cp = df_cp.merge(df_cp_selfjoin, on=splitted_col_name)
df_cp = df_cp[df_cp["CASISTICA_MOTIVAZIONE"] != df_cp["CASISTICA_MOTIVAZIONE_selfjoin"]]

# For every (class, n-gram): total occurrences of the n-gram in all other classes.
# Plain .sum() is used: the first positional argument of the groupby sum is
# `numeric_only`, not a column name.
df_cp_tfidf = df_cp[["CASISTICA_MOTIVAZIONE", splitted_col_name, "count_selfjoin"]].groupby(["CASISTICA_MOTIVAZIONE", splitted_col_name]).sum() \
    .rename(columns={"count_selfjoin": "sum_selfjoin"})
df_cp = df_cp.merge(df_cp_tfidf, on=["CASISTICA_MOTIVAZIONE", splitted_col_name])

# Ratio between the occurrences of the n-gram in the class and its occurrences
# in every other class. Despite the column name this is not a textbook TF-IDF.
df_cp["tf_idf"] = df_cp["count_placeholder"] / df_cp["sum_selfjoin"]

df_cp = df_cp.sort_values(by="tf_idf")

# Show the 115 rows with the highest ratio (one row per other class sharing the n-gram).
df_cp.iloc[-115:]


Unnamed: 0,CASISTICA_MOTIVAZIONE,NOTE_OPERATORE_split_3gram,count_placeholder,CASISTICA_MOTIVAZIONE_selfjoin,count_selfjoin,sum_selfjoin,tf_idf
2627,Generica,"(mai, stato, proprietario)",11,Periodo diverso,2,3,3.666667
2626,Generica,"(mai, stato, proprietario)",11,No esenzione PH,1,3,3.666667
2625,Periodo diverso,"(manca, vers, allega)",11,Generica,3,3,3.666667
2587,Furto non annotato,"(annotata, pra, già)",8,Esportazione non annotata,1,2,4.0
2588,Furto non annotato,"(annotata, pra, già)",8,Generica,1,2,4.0
2386,Periodo diverso,"(periodo, tributario, versamento)",4,Generica,1,1,4.0
2335,Regione diversa,"(versamento, favore, regione)",4,Periodo diverso,1,1,4.0
2347,Furto non annotato,"(furto, allega, denuncia)",4,Generica,1,1,4.0
2356,Vendita tardiva,"(vendita, bollo, dovuto)",4,Generica,1,1,4.0
2384,Periodo diverso,"(versamento, allegato, relativo)",4,Vendita tardiva,1,1,4.0
