2017-10-30 22:32:07 +00:00
|
|
|
from infBack import get_vect as gv
|
2017-11-07 15:04:18 +00:00
|
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
2017-11-28 01:14:38 +00:00
|
|
|
from stopWords import stopWrdList
|
2017-10-30 22:32:07 +00:00
|
|
|
import numpy as np
|
2017-11-07 15:04:18 +00:00
|
|
|
|
2017-10-30 22:32:07 +00:00
|
|
|
|
2017-11-28 01:14:38 +00:00
|
|
|
def clustering():
    """Return the tokenized texts of the corpus rows that contain election cue words.

    The rows returned by ``gv()`` are turned into a NumPy array and the value
    at column index 2 of every row is used as the document text (presumably
    the article body -- TODO confirm against ``infBack.get_vect``).  A
    ``TfidfVectorizer`` restricted to a fixed vocabulary of cue words scores
    each text; a text is kept when at least one cue word gets a non-zero
    weight.

    Returns:
        list[list[str]]: one entry per kept text, each being that text split
        on single space characters.  An empty list is returned when ``gv()``
        yields no data.
    """
    # These are the relevant news cue words.
    voc = ["ine", "pri", "pan", "prd", "pt", "pvem", "verde", "movimiento", "ciudadano", "panal", "alianza", "morena", "partido", "encuentro", "social", "electoral"]

    stop_words = stopWrdList()
    dataVect = np.array(gv())
    if dataVect.size == 0:
        # Nothing to classify; indexing column 2 of an empty array below
        # would otherwise fail with an IndexError.
        return []
    corpus = dataVect[:, 2]

    vectorizer = TfidfVectorizer(strip_accents='ascii', analyzer='word', stop_words=stop_words, vocabulary=voc)
    X = vectorizer.fit_transform(corpus)

    # The indexes are extracted to obtain only the relevant news from the
    # general corpus.  Row sums are taken directly on the matrix returned by
    # the vectorizer, so no dense copy is needed; TF-IDF weights are
    # non-negative, so a row sums to zero only when no cue word occurs in it.
    row_totals = np.asarray(X.sum(axis=1)).ravel()
    index = np.flatnonzero(row_totals)

    # This section of the code processes the political party news in order
    # to give an emotional classification: each relevant text is split into
    # its space-separated tokens.
    return [corpus[i].split(' ') for i in index]
|