import jieba
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    "我喜欢使用Python编程语言",
    "Python是一种强大的编程语言",
    "Python编程语言被广泛使用"
]
# Segment each sentence into words with jieba, then re-join them with
# spaces so CountVectorizer's whitespace/regex tokenizer can split them
corpus_seg = []
for sentence in corpus:
    seg_list = jieba.cut(sentence)
    corpus_seg.append(" ".join(seg_list))
# Count term frequencies with CountVectorizer, then print the vocabulary
# and the document-term matrix (get_feature_names() was removed in
# scikit-learn 1.2; use get_feature_names_out() instead)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus_seg)
print(vectorizer.get_feature_names_out())
print(X.toarray())
# Expected output (exact tokens may vary with the jieba version; note that
# CountVectorizer's default token_pattern drops single-character tokens
# such as 我, 是, 的 and 被, and the vocabulary is sorted):
# ['python' '一种' '使用' '喜欢' '广泛' '强大' '编程' '语言']
# [[1 0 1 1 0 0 1 1]
#  [1 1 0 0 0 1 1 1]
#  [1 0 1 0 1 0 1 1]]
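
# A minimal alternative sketch, assuming a recent scikit-learn: skip the
# space-joining step and pass jieba's list tokenizer directly to
# CountVectorizer. token_pattern=None makes explicit that the regex
# tokenizer is unused; unlike the default pattern, this also keeps
# single-character tokens such as 我, 是, 的 and 被. The names
# vectorizer_zh and X_zh are just illustrative.
vectorizer_zh = CountVectorizer(tokenizer=jieba.lcut, token_pattern=None)
X_zh = vectorizer_zh.fit_transform(corpus)
print(vectorizer_zh.get_feature_names_out())
print(X_zh.toarray())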