Merge pull request #5 from EddieCueto/fedora25_work

Fedora25 work
2017-11-27 17:33:21 -06:00 · 2017-11-27 17:33:21 -06:00 · 71715a021c
commit 71715a021c
parent 2c81e015ca 39377b0bf9
2 changed files with 18 additions and 27 deletions
--- a/clust.py
+++ b/clust.py
@ -1,10 +1,7 @@
 from infBack import get_vect as gv
+from sklearn.feature_extraction.text import TfidfVectorizer
 import numpy as np
-from sklearn.feature_extraction.text import TfidfTransformer
-from sklearn.feature_extraction.text import CountVectorizer
-from sklearn import cluster
-from matplotlib import pyplot
-import numpy as np
+

 def stopWrdList():
    sw = open('stop.words')
@ -13,6 +10,8 @@ def stopWrdList():
    return [l.strip('\n\r') for l in prue[0]]


+voc = ["ine", "pri", "pan", "prd", "pt", "pvem", "verde", "movimiento", "ciudadano", "panal", "alianza", "morena", "partido", "encuentro", "social", "electoral"]
+
 stop_words = stopWrdList()

 dataVect = gv()
@ -21,36 +20,26 @@ dataVect = np.array(dataVect)

 corpus = dataVect[:, 2]

-vectorizer = CountVectorizer(stop_words=stop_words)
-transformer = TfidfTransformer(smooth_idf=False)
+vectorizer = TfidfVectorizer(strip_accents='ascii', analyzer='word', stop_words=stop_words, vocabulary=voc)

 X = vectorizer.fit_transform(corpus)

-del dataVect, corpus, stop_words
+del dataVect, stop_words, vectorizer  # , corpus

 J = X.toarray()

-tf_idf = transformer.fit_transform(J)
+# print(J)

-tf_idf_matrix = tf_idf.toarray()
+index = []

-k = 2
-kmeans = cluster.KMeans(n_clusters=k)
-kmeans.fit(J)
+for x in range(0, len(J)):
+    if sum(J[x]) != 0:
+        index.append(x)

-labels = kmeans.labels_
-centroids = kmeans.cluster_centers_
+index = tuple(index)

-for i in range(k):
-    # select only data observations with cluster label == i
-    ds = J[np.where(labels == i)]
-    # plot the data observations
-    pyplot.plot(ds[:,0],ds[:,1],'o')
-    # plot the centroids
-    lines = pyplot.plot(centroids[i, 0], centroids[i, 1], 'kx')
-    # make the centroid x's bigger
-    pyplot.setp(lines, ms=15.0)
-    pyplot.setp(lines, mew=2.0)
-pyplot.show()
+electCorp = [corpus[x] for x in index]

-print(X.toarray())
+del corpus
+
+print(electCorp)
--- a/infBack.py
+++ b/infBack.py
@ -27,6 +27,8 @@ def get_vect():
    return impDat


+# print(len(get_vect()))
+

 # this section of the code shows how to extract relevant data from the dictionaries
 """