small change to the model
This commit is contained in:
parent
e286326c9a
commit
f9830e61d2
39
clust.py
39
clust.py
|
@ -1,10 +1,7 @@
|
||||||
from infBack import get_vect as gv
|
from infBack import get_vect as gv
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.feature_extraction.text import TfidfTransformer
|
|
||||||
from sklearn.feature_extraction.text import CountVectorizer
|
|
||||||
from sklearn import cluster
|
|
||||||
from matplotlib import pyplot
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
def stopWrdList():
|
def stopWrdList():
|
||||||
sw = open('stop.words')
|
sw = open('stop.words')
|
||||||
|
@ -13,6 +10,8 @@ def stopWrdList():
|
||||||
return [l.strip('\n\r') for l in prue[0]]
|
return [l.strip('\n\r') for l in prue[0]]
|
||||||
|
|
||||||
|
|
||||||
|
voc = ["ine", "pri", "pan", "prd", "pt", "pvem", "verde", "movimiento", "ciudadano", "panal", "alianza", "morena", "partido", "encuentro", "social", "electoral"]
|
||||||
|
|
||||||
stop_words = stopWrdList()
|
stop_words = stopWrdList()
|
||||||
|
|
||||||
dataVect = gv()
|
dataVect = gv()
|
||||||
|
@ -21,36 +20,12 @@ dataVect = np.array(dataVect)
|
||||||
|
|
||||||
corpus = dataVect[:, 2]
|
corpus = dataVect[:, 2]
|
||||||
|
|
||||||
vectorizer = CountVectorizer(stop_words=stop_words)
|
vectorizer = TfidfVectorizer(strip_accents='ascii', analyzer='word', stop_words=stop_words, vocabulary=voc)
|
||||||
transformer = TfidfTransformer(smooth_idf=False)
|
|
||||||
|
|
||||||
X = vectorizer.fit_transform(corpus)
|
X = vectorizer.fit_transform(corpus)
|
||||||
|
|
||||||
del dataVect, corpus, stop_words
|
del dataVect, stop_words, vectorizer # , corpus
|
||||||
|
|
||||||
J = X.toarray()
|
J = X.toarray()
|
||||||
|
|
||||||
tf_idf = transformer.fit_transform(J)
|
print(J)
|
||||||
|
|
||||||
tf_idf_matrix = tf_idf.toarray()
|
|
||||||
|
|
||||||
k = 2
|
|
||||||
kmeans = cluster.KMeans(n_clusters=k)
|
|
||||||
kmeans.fit(J)
|
|
||||||
|
|
||||||
labels = kmeans.labels_
|
|
||||||
centroids = kmeans.cluster_centers_
|
|
||||||
|
|
||||||
for i in range(k):
|
|
||||||
# select only data observations with cluster label == i
|
|
||||||
ds = J[np.where(labels == i)]
|
|
||||||
# plot the data observations
|
|
||||||
pyplot.plot(ds[:, 0], ds[:, 1], 'o')
|
|
||||||
# plot the centroids
|
|
||||||
lines = pyplot.plot(centroids[i, 0], centroids[i, 1], 'kx')
|
|
||||||
# make the centroid x's bigger
|
|
||||||
pyplot.setp(lines, ms=15.0)
|
|
||||||
pyplot.setp(lines, mew=2.0)
|
|
||||||
pyplot.show()
|
|
||||||
|
|
||||||
print(X.toarray())
|
|
||||||
|
|
|
@ -27,6 +27,8 @@ def get_vect():
|
||||||
return impDat
|
return impDat
|
||||||
|
|
||||||
|
|
||||||
|
# print(len(get_vect()))
|
||||||
|
|
||||||
|
|
||||||
# this section of the code shows how to extract relevant data from the dictionaries
|
# this section of the code shows how to extract relevant data from the dictionaries
|
||||||
"""
|
"""
|
||||||
|
|
Loading…
Reference in New Issue