v0.3
This commit is contained in:
parent
64698013ef
commit
10eecbf0c8
|
@ -0,0 +1,6 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="VcsDirectoryMappings">
|
||||||
|
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||||
|
</component>
|
||||||
|
</project>
|
|
@ -0,0 +1,80 @@
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from stopWords import stopWrdList
|
||||||
|
|
||||||
|
def getTrnVect():
    """Load the trained vectors stored in ``trn_vect.vec``.

    The file is read from the current working directory and every line is
    parsed as one YAML value (presumably a flat list of floats per line --
    confirm against whatever writes the file).

    Returns:
        list: one parsed entry per line of the file, in file order. An empty
        file yields an empty list.
    """
    import yaml

    # Context manager so the handle is closed as soon as the text is read.
    with open('trn_vect.vec', 'r') as vect_file:
        str_trained_vect = vect_file.read().split('\n')

    # A file ending in '\n' produces one trailing empty string. Drop only
    # that artefact, so a file without a final newline keeps its last vector.
    if str_trained_vect and str_trained_vect[-1] == '':
        str_trained_vect.pop()

    # NOTE(review): switched from yaml.load() to yaml.safe_load(). A bare
    # yaml.load() requires an explicit Loader on PyYAML >= 6 and the legacy
    # default loader can build arbitrary objects from the file contents.
    return [yaml.safe_load(line) for line in str_trained_vect]
|
||||||
|
|
||||||
|
|
||||||
|
def classify_news(document):
    """Classify the news text in *document* and print the matching label.

    The file content is converted into a TF-IDF vector (at most 100
    features), zero-padded up to the length of the trained vectors returned
    by ``getTrnVect()``, and compared with each trained vector through
    ``cos_sim``. The description belonging to the highest similarity is
    printed; nothing is returned.

    Args:
        document: path of the text file holding the news to classify.
    """
    from similarityMeasures import cos_sim

    # Read the whole note; the context manager closes the file afterwards.
    with open(document, 'r') as news_file:
        news = news_file.read()

    stop_words = stopWrdList()
    vectorizer = TfidfVectorizer(strip_accents='ascii', analyzer='word',
                                 stop_words=stop_words, max_features=100)
    # A single-document corpus: row 0 is the vector of the note.
    vector = list(vectorizer.fit_transform([news]).toarray()[0])

    trained_vectors = getTrnVect()

    # Pad up to the longest trained vector. Relying on the first entry alone
    # breaks when that entry is an empty list, because every comparison then
    # has mismatched lengths.
    len_train = max((len(trained) for trained in trained_vectors), default=0)
    # A non-positive difference multiplies to an empty list (no padding).
    vector.extend([0] * (len_train - len(vector)))

    sim_vect = [cos_sim(vector, trained) for trained in trained_vectors]
    maxi = max(sim_vect)

    # When several classes share the best score the last one wins.
    y = [index for index, sim in enumerate(sim_vect) if sim == maxi][-1]

    part_neu_vect = 'This note has neutral emotions and it is related with the party'
    part_neg_vect = 'This note has negative emotions and it is related with the party'
    part_pos_vect = 'This note has positive emotions and it is related with the party'
    cont_neu_vect = 'This note has neutral emotions and it is related with the opposition'
    cont_neg_vect = 'This note has negative emotions and it is related with the opposition'
    cont_pos_vect = 'This note has positive emotions and it is related with the opposition'
    neut_neu_vect = 'This note has neutral emotions and it is not particularly related a political party'
    neut_neg_vect = 'This note has negative emotions and it is not particularly related a political party'
    neut_pos_vect = 'This note has positive emotions and it is not particularly related a political party'

    # Position in this list must match the line order of the trained vectors.
    results = [part_neu_vect, part_neg_vect, part_pos_vect,
               cont_neu_vect, cont_neg_vect, cont_pos_vect,
               neut_neu_vect, neut_neg_vect, neut_pos_vect]

    print(results[y])
|
10
main.py
10
main.py
|
@ -1,8 +1,8 @@
|
||||||
|
|
||||||
from newsTrain import trainVect, flagger
|
from newsTrain import saveTraining
|
||||||
|
from classify_news import classify_news
|
||||||
|
|
||||||
|
# saveTraining()
|
||||||
|
|
||||||
sert = trainVect()
|
classify_news('news_to_classify.txt')
|
||||||
|
classify_news('news2.txt')
|
||||||
for i in sert:
|
|
||||||
print(i)
|
|
||||||
|
|
44
newsTrain.py
44
newsTrain.py
|
@ -2,7 +2,6 @@ from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
from stopWords import stopWrdList
|
from stopWords import stopWrdList
|
||||||
from retEmoDict import emoDic
|
from retEmoDict import emoDic
|
||||||
from clust import clustering
|
from clust import clustering
|
||||||
import operator
|
|
||||||
|
|
||||||
def trainPre(word_array, dict):
|
def trainPre(word_array, dict):
|
||||||
|
|
||||||
|
@ -178,7 +177,7 @@ def trainVect():
|
||||||
|
|
||||||
stop_words = stopWrdList()
|
stop_words = stopWrdList()
|
||||||
|
|
||||||
vectorizer = TfidfVectorizer(strip_accents='ascii', analyzer='word', stop_words=stop_words)
|
vectorizer = TfidfVectorizer(strip_accents='ascii', analyzer='word', stop_words=stop_words, max_features=100)
|
||||||
|
|
||||||
X = vectorizer.fit_transform(corpus)
|
X = vectorizer.fit_transform(corpus)
|
||||||
vector = X.toarray()
|
vector = X.toarray()
|
||||||
|
@ -237,6 +236,8 @@ def trainVect():
|
||||||
neut_neg_vect = [vector[x] for x in neut_neg_ind]
|
neut_neg_vect = [vector[x] for x in neut_neg_ind]
|
||||||
neut_pos_vect = [vector[x] for x in neut_pos_ind]
|
neut_pos_vect = [vector[x] for x in neut_pos_ind]
|
||||||
|
|
||||||
|
############################################ 1
|
||||||
|
|
||||||
len1 = len(part_neu_vect)
|
len1 = len(part_neu_vect)
|
||||||
if len1 != 0:
|
if len1 != 0:
|
||||||
for a in range(len1):
|
for a in range(len1):
|
||||||
|
@ -251,6 +252,8 @@ def trainVect():
|
||||||
else:
|
else:
|
||||||
part_neu_vect = []
|
part_neu_vect = []
|
||||||
|
|
||||||
|
############################################ 2
|
||||||
|
|
||||||
len1 = len(part_neg_vect)
|
len1 = len(part_neg_vect)
|
||||||
if len1 != 0:
|
if len1 != 0:
|
||||||
for a in range(len1):
|
for a in range(len1):
|
||||||
|
@ -258,11 +261,15 @@ def trainVect():
|
||||||
tmp = operate_on_Narray(part_neg_vect[0], tmp[a+1], lambda x, y: x + y)
|
tmp = operate_on_Narray(part_neg_vect[0], tmp[a+1], lambda x, y: x + y)
|
||||||
|
|
||||||
tmp = operate_on_Narray(part_neg_vect[0], tmp[a+1], lambda x, y: x / len1)
|
tmp = operate_on_Narray(part_neg_vect[0], tmp[a+1], lambda x, y: x / len1)
|
||||||
|
|
||||||
part_neg_vect = list(tmp)
|
part_neg_vect = list(tmp)
|
||||||
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
part_neg_vect = []
|
part_neg_vect = []
|
||||||
|
|
||||||
|
############################################ 3
|
||||||
|
|
||||||
len1 = len(part_pos_vect)
|
len1 = len(part_pos_vect)
|
||||||
if len1 != 0:
|
if len1 != 0:
|
||||||
for a in range(len1):
|
for a in range(len1):
|
||||||
|
@ -275,6 +282,8 @@ def trainVect():
|
||||||
else:
|
else:
|
||||||
part_pos_vect = []
|
part_pos_vect = []
|
||||||
|
|
||||||
|
############################################ 4
|
||||||
|
|
||||||
len1 = len(cont_neu_vect)
|
len1 = len(cont_neu_vect)
|
||||||
if len1 != 0:
|
if len1 != 0:
|
||||||
for a in range(len1):
|
for a in range(len1):
|
||||||
|
@ -287,6 +296,8 @@ def trainVect():
|
||||||
else:
|
else:
|
||||||
cont_neu_vect = []
|
cont_neu_vect = []
|
||||||
|
|
||||||
|
############################################ 5
|
||||||
|
|
||||||
len1 = len(cont_neg_vect)
|
len1 = len(cont_neg_vect)
|
||||||
if len1 != 0:
|
if len1 != 0:
|
||||||
for a in range(len1):
|
for a in range(len1):
|
||||||
|
@ -299,6 +310,8 @@ def trainVect():
|
||||||
else:
|
else:
|
||||||
cont_neg_vect = []
|
cont_neg_vect = []
|
||||||
|
|
||||||
|
############################################ 6
|
||||||
|
|
||||||
len1 = len(cont_pos_vect)
|
len1 = len(cont_pos_vect)
|
||||||
if len1 != 0:
|
if len1 != 0:
|
||||||
for a in range(len1):
|
for a in range(len1):
|
||||||
|
@ -311,6 +324,22 @@ def trainVect():
|
||||||
else:
|
else:
|
||||||
cont_pos_vect = []
|
cont_pos_vect = []
|
||||||
|
|
||||||
|
############################################ 7
|
||||||
|
|
||||||
|
len1 = len(neut_neu_vect)
|
||||||
|
if len1 != 0:
|
||||||
|
for a in range(len1):
|
||||||
|
tmp = neut_neu_vect[0]
|
||||||
|
tmp = operate_on_Narray(neut_neu_vect[0], tmp[a + 1], lambda x, y: x + y)
|
||||||
|
|
||||||
|
tmp = operate_on_Narray(neut_neu_vect[0], tmp[a + 1], lambda x, y: x / len1)
|
||||||
|
neut_neu_vect = list(tmp)
|
||||||
|
|
||||||
|
else:
|
||||||
|
neut_neu_vect = []
|
||||||
|
|
||||||
|
############################################ 8
|
||||||
|
|
||||||
len1 = len(neut_neg_vect)
|
len1 = len(neut_neg_vect)
|
||||||
if len1 != 0:
|
if len1 != 0:
|
||||||
for a in range(len1):
|
for a in range(len1):
|
||||||
|
@ -324,6 +353,8 @@ def trainVect():
|
||||||
else:
|
else:
|
||||||
neut_neg_vect = []
|
neut_neg_vect = []
|
||||||
|
|
||||||
|
############################################ 9
|
||||||
|
|
||||||
len1 = len(neut_pos_vect)
|
len1 = len(neut_pos_vect)
|
||||||
if len1 != 0:
|
if len1 != 0:
|
||||||
for a in range(len1):
|
for a in range(len1):
|
||||||
|
@ -341,3 +372,12 @@ def trainVect():
|
||||||
|
|
||||||
return [part_neu_vect, part_neg_vect, part_pos_vect, cont_neu_vect, cont_neg_vect, cont_pos_vect, neut_neu_vect, neut_neg_vect, neut_pos_vect]
|
return [part_neu_vect, part_neg_vect, part_pos_vect, cont_neu_vect, cont_neg_vect, cont_pos_vect, neut_neu_vect, neut_neg_vect, neut_pos_vect]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def saveTraining():
    """Compute the training vectors and write them to ``trn_vect.vec``.

    Calls ``trainVect()`` and writes ``str()`` of every returned entry on its
    own line, overwriting any existing ``trn_vect.vec`` in the current
    working directory.
    """
    sert = trainVect()

    # The context manager flushes and closes the file, so no buffered line
    # is lost and the handle is released even if a write fails.
    with open('trn_vect.vec', 'w') as trn_vect:
        trn_vect.writelines(str(entry) + '\n' for entry in sert)
|
||||||
|
|
|
@ -0,0 +1,45 @@
|
||||||
|
"""
|
||||||
|
Created on Mon Apr 17 09:34:40 2017
|
||||||
|
functions to calculate the similarity measure of two real vectors
|
||||||
|
@author: nlp
|
||||||
|
"""
|
||||||
|
# The cosine measure definition
|
||||||
|
def cos_sim(vect1, vect2):
    """Return the cosine similarity of two numeric vectors.

    Args:
        vect1: sequence of numbers.
        vect2: sequence of numbers.

    Returns:
        The dot product divided by the product of both norms. Returns 0 when
        the vectors differ in length, or when either vector has zero norm
        (this includes two empty vectors).
    """
    if len(vect1) != len(vect2):
        return 0

    denominator = norm(vect1) * norm(vect2)
    # A zero-norm vector has no direction: report "no similarity" instead of
    # dividing by zero.
    if denominator == 0:
        return 0

    dot_product = sum(a * b for a, b in zip(vect1, vect2))
    return dot_product / denominator
|
||||||
|
|
||||||
|
# Norm of vector
|
||||||
|
def norm(vect):
    """Return the Euclidean (L2) norm of *vect*.

    Args:
        vect: iterable of numbers.

    Returns:
        float: square root of the sum of the squared components; 0.0 for an
        empty vector.
    """
    import math as mth

    # Sum the squares lazily instead of filling a preallocated list by index.
    return mth.sqrt(sum(component * component for component in vect))
|
||||||
|
|
||||||
|
# Jaccard similarity
|
||||||
|
def jac_sim(set_A, set_B):
    """Return the Jaccard similarity of two sets.

    Args:
        set_A: a ``set`` or ``frozenset``.
        set_B: a ``set`` or ``frozenset``.

    Returns:
        ``len(A & B) / len(A | B)`` as a float between 0 and 1. Two empty
        sets are treated as identical and give 1.0. When either argument is
        not a set, a message is printed and ``None`` is returned.
    """
    set_types = (set, frozenset)
    # Check BOTH arguments; `(a and b) == x` would only compare the second.
    if not (isinstance(set_A, set_types) and isinstance(set_B, set_types)):
        print('One of the inputs not of type set')
        return None

    union = set_A | set_B
    # Two empty sets: avoid 0/0 and report them as identical.
    if not union:
        return 1.0

    # Divide by the union (not the symmetric difference) so the value stays
    # within [0, 1] for every pair of sets.
    return len(set_A & set_B) / len(union)
|
||||||
|
|
|
@ -0,0 +1,9 @@
|
||||||
|
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.029564870972714475, 0.0, 0.031681585307806945, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.085145070883776985, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.056251146398826481, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.073563944295505224, 0.0, 0.10753175525256822, 0.0, 0.0, 0.035183423033850185, 0.0, 0.0, 0.047392323741008761, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.085145070883776985, 0.0]
|
||||||
|
[]
|
||||||
|
[]
|
||||||
|
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.070657270308847969, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.026519396344811225, 0.0, 0.028418068131773094, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.083744798839379686, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.058615615346713827, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07637428499714545, 0.0, 0.0, 0.0, 0.070657270308847969, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
|
||||||
|
[]
|
||||||
|
[0.10208139064065742, 0.0, 0.092124985572785187, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.13300880960778988, 0.0, 0.0, 0.0, 0.046184567743872848, 0.13300880960778988, 0.049491179049355211, 0.0, 0.0, 0.0, 0.0, 0.096826048928060432, 0.12305240453991766, 0.0, 0.0, 0.0, 0.0, 0.14584484823505761, 0.0, 0.14584484823505761, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.14584484823505761, 0.0, 0.0, 0.14584484823505761, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.087872356484313893, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.11491742926792517, 0.0, 0.0, 0.0, 0.0, 0.0839900103007927, 0.0, 0.13300880960778988, 0.05496155170329832, 0.0, 0.0, 0.0, 0.12305240453991766, 0.0, 0.0, 0.0, 0.13300880960778988, 0.0, 0.0, 0.0, 0.0]
|
||||||
|
[0.0, 0.0, 0.044617190598828134, 0.0, 0.0, 0.0, 0.059595695487669048, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.022367717606043218, 0.0, 0.023969147510598116, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.064417697030794654, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.026618511934701741, 0.0, 0.0, 0.0, 0.0, 0.0, 0.064417697030794654, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
|
||||||
|
[0.18807877008848775, 0.0, 0.0, 0.24506066350553238, 0.24506066350553238, 0.0, 0.0, 0.0, 0.0, 0.24506066350553238, 0.0, 0.0, 0.0, 0.085092264553030247, 0.0, 0.091184495307262525, 0.0, 0.0, 0.0, 0.0, 0.17839612176741071, 0.22671659111470846, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.26871028605351621, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.26871028605351621, 0.0, 0.0, 0.0, 0.0, 0.26871028605351621, 0.0, 0.22671659111470846, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.26871028605351621, 0.0, 0.24506066350553238, 0.0, 0.0, 0.0, 0.24506066350553238, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.16189948656287695, 0.0, 0.0, 0.24506066350553238, 0.24506066350553238, 0.0, 0.0, 0.0, 0.0, 0.21172839263647156, 0.0, 0.0, 0.0, 0.0, 0.1547464992194269, 0.0, 0.0, 0.10126332509418293, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
|
||||||
|
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.066242832843992142, 0.0, 0.0, 0.0, 0.0, 0.010488531607570335, 0.030206347527019731, 0.011239464200115866, 0.033121416421996071, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0279452444280088, 0.0279452444280088, 0.0, 0.033121416421996071, 0.0, 0.0, 0.0, 0.033121416421996071, 0.033121416421996071, 0.0, 0.033121416421996071, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.033121416421996071, 0.0279452444280088, 0.033121416421996071, 0.033121416421996071, 0.0, 0.0, 0.0, 0.0, 0.033121416421996071, 0.033121416421996071, 0.0, 0.033121416421996071, 0.0, 0.0, 0.0, 0.033121416421996071, 0.033121416421996071, 0.033121416421996071, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.033121416421996071, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.030206347527019731, 0.0, 0.0, 0.0, 0.026097788677415228, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.012481787757288977, 0.0, 0.0, 0.0, 0.0, 0.0, 0.030206347527019731, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0279452444280088]
|
Loading…
Reference in New Issue