基于机器学习方法(决策树、朴素贝叶斯分类器、支持向量机),对中文酒店评论进行情感分析。
使用的语料库为谭松波发布的中文情感挖掘语料(ChnSentiCorp)中的酒店评论数据,包含2000条负面评论和2000条正面评论,每条评论单独存放在一个文件中。语料可从 https://pan.baidu.com/s/11MIkstHNxOb5lN7Ggg1iMA 下载。
import os
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn import svm
from sklearn import tree
import scipy.sparse as sp
def read_data(file_dir):
    """Read every regular file in ``file_dir`` as UTF-8 text.

    Entries are visited in sorted file-name order so the returned list is
    identical on every run and platform (``os.listdir`` itself makes no
    ordering guarantee). Callers that slice the result by position
    therefore get a reproducible split. Sub-directories are skipped.

    Args:
        file_dir: Path of the directory holding one document per file.

    Returns:
        A list with the full text of each file, ordered by file name.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    data = []
    for name in sorted(os.listdir(file_dir)):
        path = os.path.join(file_dir, name)
        # Only regular files hold documents; opening a directory would raise.
        if not os.path.isfile(path):
            continue
        with open(path, mode='r', encoding='UTF-8') as f:
            data.append(f.read())
    return data
# Corpus locations: one UTF-8 text file per review, separated by polarity.
comments_dir_pos = 'ChnSentiCorp情感分析酒店评论/正面'
comments_dir_neg = 'ChnSentiCorp情感分析酒店评论/负面'
docs_pos = read_data(comments_dir_pos)
docs_neg = read_data(comments_dir_neg)
docs = docs_pos + docs_neg

# Labels follow the order of `docs`: +1 for positive, -1 for negative.
# The counts come from the documents actually read, so labels stay aligned
# with `docs` even if a directory does not hold exactly 2000 files.
n_pos = len(docs_pos)
n_neg = len(docs_neg)
sent = [1] * n_pos + [-1] * n_neg

# Binary bag-of-words features (word present / absent), with jieba doing the
# Chinese word segmentation. jieba.lcut returns the tokens as a list.
countVectorizer = CountVectorizer(binary=True, tokenizer=jieba.lcut)
matrix = countVectorizer.fit_transform(docs)

# Per-class split: the first 80% of each polarity is used for training and
# the remaining 20% for testing (1600/400 per class for a 2000/2000 corpus).
n_pos_train = n_pos * 4 // 5
n_neg_train = n_neg * 4 // 5
neg_split = n_pos + n_neg_train  # row index where negative test rows begin
train = sp.vstack((matrix[0:n_pos_train], matrix[n_pos:neg_split]))
test = sp.vstack((matrix[n_pos_train:n_pos], matrix[neg_split:n_pos + n_neg]))
train_sent = sent[0:n_pos_train] + sent[n_pos:neg_split]
test_sent = sent[n_pos_train:n_pos] + sent[neg_split:n_pos + n_neg]
# Fit each model on the same training split and report its accuracy on the
# held-out test split. Each entry pairs the report format string with an
# unfitted estimator; they are evaluated in order: decision tree, Bernoulli
# naive Bayes, linear-kernel SVM.
classifiers = (
    ('决策树情感分析精度:%.2f', tree.DecisionTreeClassifier()),
    ('朴素贝叶斯情感分析精度:%.2f', BernoulliNB()),
    ('支持向量机情感分析精度:%.2f', svm.SVC(kernel='linear')),
)
for report_format, clf in classifiers:
    clf.fit(train, train_sent)
    accuracy = clf.score(test, test_sent)
    print(report_format % accuracy)