32 lines
744 B
Python
32 lines
744 B
Python
from infBack import get_vect as gv
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
import numpy as np
|
|
|
|
|
|
def stopWrdList():
    """Return the stop words listed in the ``stop.words`` file.

    The file is looked up relative to the current working directory and is
    read with the platform's default text encoding. Each line of the file
    yields one entry in the result.

    Returns:
        list[str]: one string per line, with leading/trailing ``\\n`` and
        ``\\r`` characters removed. Other whitespace is kept as-is. An empty
        file yields an empty list.

    Raises:
        OSError: if ``stop.words`` cannot be opened.
    """
    # The context manager guarantees the handle is closed even if reading
    # fails part-way through.
    with open('stop.words') as sw:
        return [line.strip('\n\r') for line in sw]
|
|
|
|
|
|
# Fixed vocabulary handed to the vectorizer below; all terms are lower-case
# ASCII strings.
voc = [
    "ine", "pri", "pan", "prd", "pt", "pvem", "verde", "movimiento",
    "ciudadano", "panal", "alianza", "morena", "partido", "encuentro",
    "social", "electoral",
]

# Stop words are loaded before the data, then passed to the vectorizer.
stop_words = stopWrdList()

# Rows returned by infBack.get_vect are turned into a 2-D array so that a
# whole column can be sliced out at once.
# NOTE(review): assumes every row has at least three fields and that
# index 2 holds the document text -- confirm against infBack.get_vect.
dataVect = np.array(gv())
corpus = dataVect[:, 2]

vectorizer = TfidfVectorizer(
    strip_accents='ascii',
    analyzer='word',
    stop_words=stop_words,
    vocabulary=voc,
)
X = vectorizer.fit_transform(corpus)

# Drop the intermediate module-level names; voc, corpus and X remain.
del dataVect, stop_words, vectorizer

# Convert the fit_transform result to an array and print it.
J = X.toarray()
print(J)
|