def getTrnVect(path='trn_vect.vec'):
    """Load the trained class vectors from a text file.

    The file is expected to hold one vector per line, each written as a
    plain list literal such as ``[0.0, 0.5]`` or ``[]`` (the format
    ``saveTraining`` produces with ``str(vector)``).

    Args:
        path: File to read. Defaults to ``'trn_vect.vec'`` in the current
            working directory, which is the location used before this
            parameter existed.

    Returns:
        A list with one entry per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError, SyntaxError: If a line is not a valid Python literal.
    """
    # ast.literal_eval only accepts literals, so a tampered file cannot run
    # code the way an unrestricted yaml.load could; it also removes the
    # dependency on PyYAML, whose load() now requires an explicit Loader.
    import ast

    trained_vect = []
    with open(path, 'r') as vec_file:
        for line in vec_file:
            line = line.strip()
            # Skip blank lines (typically the trailing newline) instead of
            # blindly dropping the last record.
            if line:
                trained_vect.append(ast.literal_eval(line))

    return trained_vect


def classify_news(document):
    """Classify one news file against the trained class vectors.

    The text is turned into a TF-IDF vector, compared with every trained
    vector using cosine similarity, and the message of the most similar
    class is printed.

    Args:
        document: Path of the text file to classify.

    Returns:
        The printed message (callers that ignored the previous ``None``
        return value are unaffected).

    Raises:
        OSError: If ``document`` or the trained-vector file cannot be read.
        ValueError: If no trained vectors are available.
        IndexError: If the trained-vector file holds more classes than
            there are result messages.
    """
    # Imported here so that importing this module (e.g. to use getTrnVect)
    # does not require scikit-learn or the project helpers to be present.
    from sklearn.feature_extraction.text import TfidfVectorizer
    from similarityMeasures import cos_sim
    from stopWords import stopWrdList

    with open(document, 'r') as news_file:
        news = news_file.read()

    # NOTE(review): the vectorizer is fitted on this single document, so its
    # columns follow this document's own vocabulary, not the vocabulary used
    # when the trained vectors were built -- confirm the columns are really
    # comparable, or persist and reuse the training vectorizer.
    vectorizer = TfidfVectorizer(strip_accents='ascii', analyzer='word',
                                 stop_words=stopWrdList(), max_features=100)
    vector = list(vectorizer.fit_transform([news]).toarray()[0])

    trained_vectors = getTrnVect()
    if not trained_vectors:
        raise ValueError('no trained vectors available to classify against')

    # Pad up to the longest trained vector. Using only the first one is
    # wrong when that class is empty ([]), which skips padding entirely.
    target_len = max(len(trained) for trained in trained_vectors)
    if target_len > len(vector):
        vector.extend([0] * (target_len - len(vector)))

    # Keep the LAST index reaching the maximum, matching the tie-breaking of
    # the earlier "scan for max" loop.
    best_index = 0
    best_sim = None
    for index, trained in enumerate(trained_vectors):
        sim = cos_sim(vector, trained)
        if best_sim is None or sim >= best_sim:
            best_index, best_sim = index, sim

    # Order must match the order in which the trained vectors were saved.
    results = [
        'This note has neutral emotions and it is related with the party',
        'This note has negative emotions and it is related with the party',
        'This note has positive emotions and it is related with the party',
        'This note has neutral emotions and it is related with the opposition',
        'This note has negative emotions and it is related with the opposition',
        'This note has positive emotions and it is related with the opposition',
        'This note has neutral emotions and it is not particularly related a political party',
        'This note has negative emotions and it is not particularly related a political party',
        'This note has positive emotions and it is not particularly related a political party',
    ]

    message = results[best_index]
    print(message)
    return message
def saveTraining(path='trn_vect.vec'):
    """Compute the trained class vectors and write them to a text file.

    One vector is written per line as a plain list literal, in the order
    returned by ``trainVect()``.

    Args:
        path: Output file. Defaults to ``'trn_vect.vec'``, the location
            used before this parameter existed.

    Raises:
        OSError: If the file cannot be written.
    """
    class_vectors = trainVect()

    # The context manager guarantees the data is flushed and the handle is
    # closed even if a write fails.
    with open(path, 'w') as out_file:
        for class_vector in class_vectors:
            # Convert each component to a built-in float so the text stays a
            # bare list literal even if the components are NumPy scalars
            # (whose repr is not a bare number on recent NumPy versions).
            out_file.write(str([float(value) for value in class_vector]) + '\n')


# Created on Mon Apr 17 09:34:40 2017
# Functions to calculate the similarity measure of two real vectors.
# @author: nlp


# The cosine measure definition
def cos_sim(vect1, vect2):
    """Return the cosine similarity of two equally long numeric vectors.

    Args:
        vect1: Sequence of numbers.
        vect2: Sequence of numbers.

    Returns:
        ``dot(vect1, vect2) / (|vect1| * |vect2|)``. Returns 0 when the
        lengths differ, and also when either vector has zero norm (which
        includes two empty vectors), since the angle is undefined there.
    """
    if len(vect1) != len(vect2):
        return 0

    denominator = norm(vect1) * norm(vect2)
    # A zero-norm vector would otherwise raise ZeroDivisionError.
    if denominator == 0:
        return 0

    dot_product = sum(a * b for a, b in zip(vect1, vect2))
    return dot_product / denominator


# Norm of vector
def norm(vect):
    """Return the Euclidean (L2) norm of ``vect``; 0.0 for an empty vector."""
    import math as mth

    return mth.sqrt(sum(component * component for component in vect))


# Jaccard similarity
def jac_sim(set_A, set_B):
    """Return the Jaccard similarity ``|A & B| / |A | B|`` of two sets.

    Args:
        set_A: A ``set`` or ``frozenset``.
        set_B: A ``set`` or ``frozenset``.

    Returns:
        A float in ``[0, 1]``. Two empty sets are identical, so they score
        1.0. If either argument is not a set, a message is printed and
        ``None`` is returned.
    """
    # Check BOTH arguments; `(a and b) == x` only ever compared the second.
    if not (isinstance(set_A, (set, frozenset))
            and isinstance(set_B, (set, frozenset))):
        print('One of the inputs not of type set')
        return None

    union = set_A | set_B
    if not union:
        # Both sets are empty: avoid 0/0 and treat them as identical.
        return 1.0

    # The denominator is the union, not the symmetric difference, so the
    # result can never exceed 1.
    return len(set_A & set_B) / len(union)
0.0, 0.0, 0.0, 0.0, 0.0, 0.085145070883776985, 0.0] +[] +[] +[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.070657270308847969, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.026519396344811225, 0.0, 0.028418068131773094, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.083744798839379686, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.058615615346713827, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07637428499714545, 0.0, 0.0, 0.0, 0.070657270308847969, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] +[] +[0.10208139064065742, 0.0, 0.092124985572785187, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.13300880960778988, 0.0, 0.0, 0.0, 0.046184567743872848, 0.13300880960778988, 0.049491179049355211, 0.0, 0.0, 0.0, 0.0, 0.096826048928060432, 0.12305240453991766, 0.0, 0.0, 0.0, 0.0, 0.14584484823505761, 0.0, 0.14584484823505761, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.14584484823505761, 0.0, 0.0, 0.14584484823505761, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.087872356484313893, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.11491742926792517, 0.0, 0.0, 0.0, 0.0, 0.0839900103007927, 0.0, 0.13300880960778988, 0.05496155170329832, 0.0, 0.0, 0.0, 0.12305240453991766, 0.0, 0.0, 0.0, 0.13300880960778988, 0.0, 0.0, 0.0, 0.0] +[0.0, 0.0, 0.044617190598828134, 0.0, 0.0, 0.0, 0.059595695487669048, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.022367717606043218, 0.0, 0.023969147510598116, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.064417697030794654, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.026618511934701741, 0.0, 0.0, 0.0, 0.0, 0.0, 0.064417697030794654, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] +[0.18807877008848775, 0.0, 0.0, 0.24506066350553238, 0.24506066350553238, 0.0, 0.0, 0.0, 0.0, 0.24506066350553238, 0.0, 0.0, 0.0, 0.085092264553030247, 0.0, 0.091184495307262525, 0.0, 0.0, 0.0, 0.0, 0.17839612176741071, 0.22671659111470846, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.26871028605351621, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.26871028605351621, 0.0, 0.0, 0.0, 0.0, 0.26871028605351621, 0.0, 0.22671659111470846, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.26871028605351621, 0.0, 0.24506066350553238, 0.0, 0.0, 0.0, 0.24506066350553238, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16189948656287695, 0.0, 0.0, 0.24506066350553238, 0.24506066350553238, 0.0, 0.0, 0.0, 0.0, 0.21172839263647156, 0.0, 0.0, 0.0, 0.0, 0.1547464992194269, 0.0, 0.0, 0.10126332509418293, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] +[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.066242832843992142, 0.0, 0.0, 0.0, 0.0, 0.010488531607570335, 0.030206347527019731, 0.011239464200115866, 0.033121416421996071, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0279452444280088, 0.0279452444280088, 0.0, 0.033121416421996071, 0.0, 0.0, 0.0, 0.033121416421996071, 0.033121416421996071, 0.0, 0.033121416421996071, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.033121416421996071, 0.0279452444280088, 0.033121416421996071, 0.033121416421996071, 0.0, 0.0, 0.0, 0.0, 0.033121416421996071, 0.033121416421996071, 0.0, 0.033121416421996071, 0.0, 0.0, 0.0, 0.033121416421996071, 0.033121416421996071, 0.033121416421996071, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.033121416421996071, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.030206347527019731, 0.0, 0.0, 0.0, 0.026097788677415228, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.012481787757288977, 0.0, 0.0, 0.0, 0.0, 0.0, 0.030206347527019731, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0279452444280088]