Merge pull request #5 from EddieCueto/fedora25_work

Fedora25 work
This commit is contained in:
Eddie Cueto-Mendoza 2017-11-27 17:33:21 -06:00 committed by GitHub
commit 71715a021c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 18 additions and 27 deletions

View File

@@ -1,10 +1,7 @@
from infBack import get_vect as gv from infBack import get_vect as gv
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import cluster
from matplotlib import pyplot
import numpy as np
def stopWrdList(): def stopWrdList():
sw = open('stop.words') sw = open('stop.words')
@@ -13,6 +10,8 @@ def stopWrdList():
return [l.strip('\n\r') for l in prue[0]] return [l.strip('\n\r') for l in prue[0]]
voc = ["ine", "pri", "pan", "prd", "pt", "pvem", "verde", "movimiento", "ciudadano", "panal", "alianza", "morena", "partido", "encuentro", "social", "electoral"]
stop_words = stopWrdList() stop_words = stopWrdList()
dataVect = gv() dataVect = gv()
@@ -21,36 +20,26 @@ dataVect = np.array(dataVect)
corpus = dataVect[:, 2] corpus = dataVect[:, 2]
vectorizer = CountVectorizer(stop_words=stop_words) vectorizer = TfidfVectorizer(strip_accents='ascii', analyzer='word', stop_words=stop_words, vocabulary=voc)
transformer = TfidfTransformer(smooth_idf=False)
X = vectorizer.fit_transform(corpus) X = vectorizer.fit_transform(corpus)
del dataVect, corpus, stop_words del dataVect, stop_words, vectorizer # , corpus
J = X.toarray() J = X.toarray()
tf_idf = transformer.fit_transform(J) # print(J)
tf_idf_matrix = tf_idf.toarray() index = []
k = 2 for x in range(0, len(J)):
kmeans = cluster.KMeans(n_clusters=k) if sum(J[x]) != 0:
kmeans.fit(J) index.append(x)
labels = kmeans.labels_ index = tuple(index)
centroids = kmeans.cluster_centers_
for i in range(k): electCorp = [corpus[x] for x in index]
# select only data observations with cluster label == i
ds = J[np.where(labels == i)]
# plot the data observations
pyplot.plot(ds[:,0],ds[:,1],'o')
# plot the centroids
lines = pyplot.plot(centroids[i, 0], centroids[i, 1], 'kx')
# make the centroid x's bigger
pyplot.setp(lines, ms=15.0)
pyplot.setp(lines, mew=2.0)
pyplot.show()
print(X.toarray()) del corpus
print(electCorp)

View File

@@ -27,6 +27,8 @@ def get_vect():
return impDat return impDat
# print(len(get_vect()))
# this section of the code show how to extract relevant data from the dictionaries # this section of the code show how to extract relevant data from the dictionaries
""" """