This commit is contained in:
Eddie 2017-12-04 12:01:46 -06:00
parent 64698013ef
commit 10eecbf0c8
6 changed files with 189 additions and 9 deletions

6
.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

80
classify_news.py Normal file
View File

@ -0,0 +1,80 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from stopWords import stopWrdList
def getTrnVect():
    """Load the trained class-centroid vectors from 'trn_vect.vec'.

    Each non-empty line of the file is a Python list literal (one vector
    per sentiment/party class, as written by newsTrain.saveTraining).

    Returns:
        list: one parsed list of floats per line of the file.
    """
    import ast

    # ast.literal_eval replaces the original yaml.load: it is stdlib,
    # safe on arbitrary text, and the file only ever contains list
    # literals. yaml.load without an explicit Loader is unsafe and was
    # an unnecessary third-party dependency here.
    with open('trn_vect.vec', 'r') as fh:
        lines = fh.read().split('\n')

    # Drop the trailing empty entry produced by the file's final newline
    # (the original popped the last element unconditionally).
    if lines and lines[-1] == '':
        lines.pop()

    return [ast.literal_eval(line) for line in lines]
def classify_news(document):
    """Classify the news article stored in file *document*.

    The article is TF-IDF vectorized, compared against the nine trained
    class centroids (loaded via getTrnVect) with cosine similarity, and
    the description of the most similar class is printed and returned.

    Args:
        document (str): path to a plain-text news article.

    Returns:
        str: the printed classification message (the original printed
        only; returning it is backward-compatible).
    """
    from similarityMeasures import cos_sim

    # Read the article; context manager closes the handle (the original
    # leaked it).
    with open(document, 'r') as fh:
        news = fh.read()

    stop_words = stopWrdList()
    vectorizer = TfidfVectorizer(strip_accents='ascii', analyzer='word',
                                 stop_words=stop_words, max_features=100)
    X = vectorizer.fit_transform([news])
    vector = list(X.toarray()[0])

    trained_vectors = getTrnVect()

    # Zero-pad the document vector up to the trained-centroid length:
    # a short article can yield fewer than max_features terms.
    pad = len(trained_vectors[0]) - len(vector)
    if pad > 0:
        vector.extend([0] * pad)

    # Cosine similarity against each class centroid; pick the best match.
    # index() takes the first maximum, which also avoids the unbound-name
    # crash the original's manual scan risked on an empty sim_vect.
    sim_vect = [cos_sim(vector, trained) for trained in trained_vectors]
    best = sim_vect.index(max(sim_vect))

    # One message per centroid, in the same order saveTraining wrote them.
    results = [
        'This note has neutral emotions and it is related with the party',
        'This note has negative emotions and it is related with the party',
        'This note has positive emotions and it is related with the party',
        'This note has neutral emotions and it is related with the opposition',
        'This note has negative emotions and it is related with the opposition',
        'This note has positive emotions and it is related with the opposition',
        'This note has neutral emotions and it is not particularly related a political party',
        'This note has negative emotions and it is not particularly related a political party',
        'This note has positive emotions and it is not particularly related a political party',
    ]
    print(results[best])
    return results[best]

10
main.py
View File

@ -1,8 +1,8 @@
from newsTrain import trainVect, flagger from newsTrain import saveTraining
from classify_news import classify_news
# saveTraining()
sert = trainVect() classify_news('news_to_classify.txt')
classify_news('news2.txt')
for i in sert:
print(i)

View File

@ -2,7 +2,6 @@ from sklearn.feature_extraction.text import TfidfVectorizer
from stopWords import stopWrdList from stopWords import stopWrdList
from retEmoDict import emoDic from retEmoDict import emoDic
from clust import clustering from clust import clustering
import operator
def trainPre(word_array, dict): def trainPre(word_array, dict):
@ -178,7 +177,7 @@ def trainVect():
stop_words = stopWrdList() stop_words = stopWrdList()
vectorizer = TfidfVectorizer(strip_accents='ascii', analyzer='word', stop_words=stop_words) vectorizer = TfidfVectorizer(strip_accents='ascii', analyzer='word', stop_words=stop_words, max_features=100)
X = vectorizer.fit_transform(corpus) X = vectorizer.fit_transform(corpus)
vector = X.toarray() vector = X.toarray()
@ -237,6 +236,8 @@ def trainVect():
neut_neg_vect = [vector[x] for x in neut_neg_ind] neut_neg_vect = [vector[x] for x in neut_neg_ind]
neut_pos_vect = [vector[x] for x in neut_pos_ind] neut_pos_vect = [vector[x] for x in neut_pos_ind]
############################################ 1
len1 = len(part_neu_vect) len1 = len(part_neu_vect)
if len1 != 0: if len1 != 0:
for a in range(len1): for a in range(len1):
@ -251,18 +252,24 @@ def trainVect():
else: else:
part_neu_vect = [] part_neu_vect = []
############################################ 2
len1 = len(part_neg_vect) len1 = len(part_neg_vect)
if len1 != 0: if len1 != 0:
for a in range(len1): for a in range(len1):
tmp = part_neg_vect[0] tmp = part_neg_vect[0]
tmp = operate_on_Narray(part_neg_vect[0], tmp[a + 1], lambda x, y: x + y) tmp = operate_on_Narray(part_neg_vect[0], tmp[a+1], lambda x, y: x + y)
tmp = operate_on_Narray(part_neg_vect[0], tmp[a+1], lambda x, y: x / len1)
tmp = operate_on_Narray(part_neg_vect[0], tmp[a + 1], lambda x, y: x / len1)
part_neg_vect = list(tmp) part_neg_vect = list(tmp)
else: else:
part_neg_vect = [] part_neg_vect = []
############################################ 3
len1 = len(part_pos_vect) len1 = len(part_pos_vect)
if len1 != 0: if len1 != 0:
for a in range(len1): for a in range(len1):
@ -275,6 +282,8 @@ def trainVect():
else: else:
part_pos_vect = [] part_pos_vect = []
############################################ 4
len1 = len(cont_neu_vect) len1 = len(cont_neu_vect)
if len1 != 0: if len1 != 0:
for a in range(len1): for a in range(len1):
@ -287,6 +296,8 @@ def trainVect():
else: else:
cont_neu_vect = [] cont_neu_vect = []
############################################ 5
len1 = len(cont_neg_vect) len1 = len(cont_neg_vect)
if len1 != 0: if len1 != 0:
for a in range(len1): for a in range(len1):
@ -299,6 +310,8 @@ def trainVect():
else: else:
cont_neg_vect = [] cont_neg_vect = []
############################################ 6
len1 = len(cont_pos_vect) len1 = len(cont_pos_vect)
if len1 != 0: if len1 != 0:
for a in range(len1): for a in range(len1):
@ -311,6 +324,22 @@ def trainVect():
else: else:
cont_pos_vect = [] cont_pos_vect = []
############################################ 7
len1 = len(neut_neu_vect)
if len1 != 0:
for a in range(len1):
tmp = neut_neu_vect[0]
tmp = operate_on_Narray(neut_neu_vect[0], tmp[a + 1], lambda x, y: x + y)
tmp = operate_on_Narray(neut_neu_vect[0], tmp[a + 1], lambda x, y: x / len1)
neut_neu_vect = list(tmp)
else:
neut_neu_vect = []
############################################ 8
len1 = len(neut_neg_vect) len1 = len(neut_neg_vect)
if len1 != 0: if len1 != 0:
for a in range(len1): for a in range(len1):
@ -324,6 +353,8 @@ def trainVect():
else: else:
neut_neg_vect = [] neut_neg_vect = []
############################################ 9
len1 = len(neut_pos_vect) len1 = len(neut_pos_vect)
if len1 != 0: if len1 != 0:
for a in range(len1): for a in range(len1):
@ -341,3 +372,12 @@ def trainVect():
return [part_neu_vect, part_neg_vect, part_pos_vect, cont_neu_vect, cont_neg_vect, cont_pos_vect, neut_neu_vect, neut_neg_vect, neut_pos_vect] return [part_neu_vect, part_neg_vect, part_pos_vect, cont_neu_vect, cont_neg_vect, cont_pos_vect, neut_neu_vect, neut_neg_vect, neut_pos_vect]
def saveTraining():
    """Train the class-centroid vectors and persist them to 'trn_vect.vec'.

    Writes one vector per line as the str() of a Python list, the format
    classify_news.getTrnVect parses back.
    """
    vectors = trainVect()
    # Context manager guarantees the file is flushed and closed; the
    # original left the handle open.
    with open('trn_vect.vec', 'w') as out:
        for vect in vectors:
            out.write(str(vect) + '\n')

45
similarityMeasures.py Executable file
View File

@ -0,0 +1,45 @@
"""
Created on Mon Apr 17 09:34:40 2017
functions to calculate the similarity measure of two real vectors
@author: nlp
"""
# The cosine measure definition
def cos_sim(vect1, vect2):
    """Return the cosine similarity of two equal-length real vectors.

    Returns 0 when the vectors differ in length (original behavior),
    and 0 when either vector has zero magnitude — the original raised
    ZeroDivisionError there, a live failure for an all-zero TF-IDF
    vector.
    """
    import math as mth

    if len(vect1) != len(vect2):
        return 0

    # Dot product and norms in one pass each; no preallocated scratch list.
    dot = sum(a * b for a, b in zip(vect1, vect2))
    n1 = mth.sqrt(sum(a * a for a in vect1))
    n2 = mth.sqrt(sum(b * b for b in vect2))

    # Guard the degenerate case instead of dividing by zero.
    if n1 == 0 or n2 == 0:
        return 0

    return dot / (n1 * n2)
# Norm of vector
def norm(vect):
    """Return the Euclidean (L2) norm of *vect*."""
    import math as mth
    # Sum the squares directly with a generator expression — same values
    # in the same order as the original preallocate-then-fill loops.
    return mth.sqrt(sum(component * component for component in vect))
# Jacard similarity
def jac_sim(set_A, set_B):
    """Return the (modified) Jaccard similarity of two sets.

    Equal sets score 1.0. Unequal sets use |A & B| / |(A | B) - (A & B)|
    — NOTE(review): this denominator is the symmetric difference, not the
    standard Jaccard union; preserved as the original's deliberate choice.

    Prints a message and returns None (implicitly) when either argument
    is not a set, matching the original's behavior.
    """
    # isinstance replaces the broken original check, whose `and` chain
    # compared only str(type(set_B)) and never validated set_A at all.
    if isinstance(set_A, set) and isinstance(set_B, set):
        if set_A == set_B:
            # Equal sets: |A & B| / |A | B| is n/n = 1.0; returning the
            # constant also covers the empty/empty case that previously
            # divided by zero.
            return 1.0
        # A != B, so the symmetric difference is non-empty: safe division.
        return len(set_A & set_B) / len((set_A | set_B) - (set_A & set_B))
    else:
        print('One of the inputs not of type set')

9
trn_vect.vec Normal file
View File

@ -0,0 +1,9 @@
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.029564870972714475, 0.0, 0.031681585307806945, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.085145070883776985, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.056251146398826481, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.073563944295505224, 0.0, 0.10753175525256822, 0.0, 0.0, 0.035183423033850185, 0.0, 0.0, 0.047392323741008761, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.085145070883776985, 0.0]
[]
[]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.070657270308847969, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.026519396344811225, 0.0, 0.028418068131773094, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.083744798839379686, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.058615615346713827, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07637428499714545, 0.0, 0.0, 0.0, 0.070657270308847969, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[]
[0.10208139064065742, 0.0, 0.092124985572785187, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.13300880960778988, 0.0, 0.0, 0.0, 0.046184567743872848, 0.13300880960778988, 0.049491179049355211, 0.0, 0.0, 0.0, 0.0, 0.096826048928060432, 0.12305240453991766, 0.0, 0.0, 0.0, 0.0, 0.14584484823505761, 0.0, 0.14584484823505761, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.14584484823505761, 0.0, 0.0, 0.14584484823505761, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.087872356484313893, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.11491742926792517, 0.0, 0.0, 0.0, 0.0, 0.0839900103007927, 0.0, 0.13300880960778988, 0.05496155170329832, 0.0, 0.0, 0.0, 0.12305240453991766, 0.0, 0.0, 0.0, 0.13300880960778988, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.044617190598828134, 0.0, 0.0, 0.0, 0.059595695487669048, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.022367717606043218, 0.0, 0.023969147510598116, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.064417697030794654, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.026618511934701741, 0.0, 0.0, 0.0, 0.0, 0.0, 0.064417697030794654, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.18807877008848775, 0.0, 0.0, 0.24506066350553238, 0.24506066350553238, 0.0, 0.0, 0.0, 0.0, 0.24506066350553238, 0.0, 0.0, 0.0, 0.085092264553030247, 0.0, 0.091184495307262525, 0.0, 0.0, 0.0, 0.0, 0.17839612176741071, 0.22671659111470846, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.26871028605351621, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.26871028605351621, 0.0, 0.0, 0.0, 0.0, 0.26871028605351621, 0.0, 0.22671659111470846, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.26871028605351621, 0.0, 0.24506066350553238, 0.0, 0.0, 0.0, 0.24506066350553238, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16189948656287695, 0.0, 0.0, 0.24506066350553238, 0.24506066350553238, 0.0, 0.0, 0.0, 0.0, 0.21172839263647156, 0.0, 0.0, 0.0, 0.0, 0.1547464992194269, 0.0, 0.0, 0.10126332509418293, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.066242832843992142, 0.0, 0.0, 0.0, 0.0, 0.010488531607570335, 0.030206347527019731, 0.011239464200115866, 0.033121416421996071, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0279452444280088, 0.0279452444280088, 0.0, 0.033121416421996071, 0.0, 0.0, 0.0, 0.033121416421996071, 0.033121416421996071, 0.0, 0.033121416421996071, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.033121416421996071, 0.0279452444280088, 0.033121416421996071, 0.033121416421996071, 0.0, 0.0, 0.0, 0.0, 0.033121416421996071, 0.033121416421996071, 0.0, 0.033121416421996071, 0.0, 0.0, 0.0, 0.033121416421996071, 0.033121416421996071, 0.033121416421996071, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.033121416421996071, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.030206347527019731, 0.0, 0.0, 0.0, 0.026097788677415228, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.012481787757288977, 0.0, 0.0, 0.0, 0.0, 0.0, 0.030206347527019731, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0279452444280088]