Keras 情感分析

参考

数据集:https://github.com/aespresso/chinese_sentiment

1. 处理

1.1. 统一序列的维度

处理文本序列时,维度必须统一。文本A分词之后有200个token,文本B分词之后有1000个token,你必须统一成一个维度,例如限定序列长度是500。文本A不到500个token,用0填充,文本B就截取一部分

# Compute the length (token count) of every tokenized sequence
num_tokens = [len(tokens) for tokens in train_tokens]
num_tokens = np.array(num_tokens)
# Unify the sequence length to mean + 2 standard deviations; this covers
# the vast majority of samples (longer texts truncated, shorter padded)
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)

2. 代码

"""
导包
"""
import numpy as np
import matplotlib.pyplot as plt
from keras.utils.vis_utils import plot_model
import re
import jieba
from gensim.models import KeyedVectors
import os
from keras.models import Sequential
from keras.layers import Dense, GRU, Embedding, LSTM, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import RMSprop
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau
from sklearn.model_selection import train_test_split

"""
加载词向量模型
"""
# Load pretrained Chinese word vectors in text word2vec format
# (300-dimensional "sgns.literature.word" embeddings).
# NOTE(review): path is hard-coded to one local Windows machine —
# parameterize before reuse.
cn_model = KeyedVectors.load_word2vec_format(
'C:/Users/Administrator/Desktop/project/pyproject/pytest/data/sgns.literature.word', binary=False)

# Dimensionality of each word vector; must match the pretrained model.
embedding_dim = 300

"""
Load the raw review texts.

Reads every file under ./pos (positive reviews) and ./neg (negative
reviews) into train_texts_orig, one string per review. Positive reviews
come first — the label vector built later relies on this ordering.
"""
pos_txts = os.listdir('pos')
neg_txts = os.listdir('neg')
# All reviews, one string each; positives first, then negatives.
train_texts_orig = []
for folder, names in (('pos', pos_txts), ('neg', neg_txts)):
    for name in names:
        # NOTE(review): encoding is unspecified, so the platform default
        # applies, and errors='ignore' silently drops undecodable bytes —
        # confirm the dataset encoding (often GB2312 for these reviews).
        with open(os.path.join(folder, name), 'r', errors='ignore') as f:
            train_texts_orig.append(f.read().strip())

"""
Tokenize: strip punctuation, segment with jieba, and map each word to its
index in the embedding vocabulary.
"""
# Strip whitespace and common Chinese/English punctuation. Compiled once and
# reused for every review; the raw string avoids Python's invalid-escape
# warnings while keeping the regex pattern itself unchanged.
_PUNCT_RE = re.compile(
    r"[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+")

train_tokens = []
for text in train_texts_orig:
    text = _PUNCT_RE.sub("", text)
    # jieba.cut yields a generator; materialize it so we can index-assign.
    cut_list = list(jieba.cut(text))
    for i, word in enumerate(cut_list):
        try:
            # Map the word to its row index in the embedding matrix.
            cut_list[i] = cn_model.vocab[word].index
        except KeyError:
            # Out-of-vocabulary words map to 0. NOTE(review): index 0 is
            # also the most frequent word's index AND the padding value, so
            # OOV, padding, and that word are conflated downstream.
            cut_list[i] = 0
    train_tokens.append(cut_list)

"""
Normalize sequence lengths.

Pick one common sequence length for the whole corpus: mean + 2 standard
deviations of the per-review token counts, which covers the vast majority
of reviews (longer ones get truncated, shorter ones padded).
"""
num_tokens = np.array([len(seq) for seq in train_tokens])

max_tokens = int(num_tokens.mean() + 2 * num_tokens.std())

"""
Prepare the embedding matrix.

Row i holds the vector of the i-th most frequent word in the pretrained
model; only the first num_words entries of the vocabulary are kept.
"""
num_words = 200000
embedding_matrix = np.zeros((num_words, embedding_dim))
for row in range(num_words):
    word = cn_model.index2word[row]
    embedding_matrix[row, :] = cn_model[word]
# Keras expects float32 weights.
embedding_matrix = embedding_matrix.astype('float32')

"""
Padding/truncating, label vector, and train/test split.
"""
# Pad short sequences with 0 at the front and truncate long ones from the
# front so every sample has exactly max_tokens indices.
train_pad = pad_sequences(train_tokens, maxlen=max_tokens,
                          padding='pre', truncating='pre')

# Any index outside the embedding matrix is treated as out-of-vocabulary.
train_pad[train_pad >= num_words] = 0

# Label vector: positives (loaded first) are 1, negatives are 0.
# Bug fix: the counts were hard-coded as 2000/2000, which silently mislabels
# samples whenever the dataset size differs; derive them from the actual
# file counts instead.
train_target = np.concatenate((np.ones(len(pos_txts)),
                               np.zeros(len(neg_txts))))

# Hold out 10% as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(train_pad,
                                                    train_target,
                                                    test_size=0.1,
                                                    random_state=12)

"""
Build the network.

Frozen pretrained embedding -> bidirectional LSTM -> LSTM -> sigmoid
output (probability that the review is positive).
"""
model = Sequential()
model.add(Embedding(num_words,
                    embedding_dim,
                    weights=[embedding_matrix],
                    input_length=max_tokens,
                    trainable=False))
model.add(Bidirectional(LSTM(units=32, return_sequences=True)))
model.add(LSTM(units=16, return_sequences=False))
model.add(Dense(1, activation='sigmoid'))

optimizer = Adam(lr=1e-3)
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

"""
Plot and display the network architecture.
"""
# Bug fix: show_layer_names was the *string* "False", which is truthy, so
# layer names were still drawn despite the apparent intent to hide them;
# pass the boolean False instead.
plot_model(model, to_file="model.png", show_shapes=True,
           show_layer_names=False, rankdir="TB")
img = plt.imread("model.png")
plt.imshow(img)
plt.show()

"""
Checkpointing.
"""
# Save only the best weights (lowest validation loss) seen during training.
path_checkpoint = 'sentiment_checkpoint.keras'
checkpoint = ModelCheckpoint(filepath=path_checkpoint,
                             monitor='val_loss',
                             verbose=1,
                             save_weights_only=True,
                             save_best_only=True)
# Resume from a previous run when a checkpoint exists; a failure to load
# (e.g. first run, no file yet) is reported and training starts fresh.
try:
    model.load_weights(path_checkpoint)
except Exception as load_error:
    print(load_error)

"""
Training.
"""
# Stop when validation loss has not improved for 3 consecutive epochs.
earlystopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
# Shrink the learning rate by 10x as soon as validation loss plateaus,
# but never below 1e-5.
lr_reduction = ReduceLROnPlateau(monitor='val_loss',
                                 factor=0.1,
                                 min_lr=1e-5,
                                 patience=0,
                                 verbose=1)
callbacks = [earlystopping, checkpoint, lr_reduction]

model.fit(X_train, y_train,
          validation_split=0.1,
          epochs=10,
          batch_size=128,
          callbacks=callbacks)

"""
Evaluation.
"""
# Score the model on the data it was trained on.
train_loss, train_accuracy = model.evaluate(X_train, y_train)
print('train loss: ', train_loss)
print('train accuracy: ', train_accuracy)
# Score the model on the held-out test set.
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print('test loss: ', test_loss)
print('test accuracy: ', test_accuracy)

"""
Sample predictions.
"""

def predict_sentiment(text):
    """Classify a single review string and print the verdict.

    Applies the same preprocessing as training (strip punctuation, jieba
    segmentation, vocabulary lookup, padding), runs the model, and prints
    whether the review is positive (output >= 0.5) or negative.

    Returns the raw sigmoid output so callers can also use the score
    programmatically (previously nothing was returned — backward
    compatible, since the old return value was None and unused).
    """
    print(text)
    # Same punctuation-stripping pattern as the training preprocessing
    # (raw string avoids invalid-escape warnings; pattern unchanged).
    text = re.sub(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+",
                  "", text)
    cut_list = list(jieba.cut(text))
    # Map words to embedding indices; OOV words and out-of-range indices
    # become 0, mirroring the training pipeline.
    for i, word in enumerate(cut_list):
        try:
            cut_list[i] = cn_model.vocab[word].index
            if cut_list[i] >= num_words:
                cut_list[i] = 0
        except KeyError:
            cut_list[i] = 0
    # Pad/truncate to the model's fixed input length.
    tokens_pad = pad_sequences([cut_list], maxlen=max_tokens,
                               padding='pre', truncating='pre')
    result = model.predict(x=tokens_pad)
    coef = result[0][0]
    if coef >= 0.5:
        print('是一例正面评价', 'output=%.2f' % coef)
    else:
        print('是一例负面评价', 'output=%.2f' % coef)
    return coef


test_list = [
'酒店设施不是新的,服务态度很不好',
'酒店卫生条件非常不好',
'床铺非常舒适',
'房间很凉,不给开暖气',
'房间很凉爽,空调冷气很足',
'酒店环境不好,住宿体验很不好',
'房间隔音不到位',
'晚上回来发现没有打扫卫生',
'因为过节所以要我临时加钱,比团购的价格贵'
]
for text in test_list[-2:]:
predict_sentiment(text)
panchaoxin wechat
关注我的公众号
支持一下