Keras 情感分析

参考

数据集:https://github.com/aespresso/chinese_sentiment

1. 处理

1.1. 统一序列的维度

处理文本序列时,维度必须统一。文本A分词之后有200个token,文本B分词之后有1000个token,你必须统一成一个维度,例如限定序列长度是500。文本A不到500个token,用0填充,文本B就截取一部分

# Compute the length (token count) of every tokenized sequence
num_tokens = [len(tokens) for tokens in train_tokens]
num_tokens = np.array(num_tokens)
# Unify the sequence length to mean + 2 standard deviations; this covers
# the vast majority of samples (longer texts truncated, shorter padded)
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)

2. 代码

"""
导包
"""
import numpy as np
import matplotlib.pyplot as plt
from keras.utils.vis_utils import plot_model
import re
import jieba
from gensim.models import KeyedVectors
import os
from keras.models import Sequential
from keras.layers import Dense, GRU, Embedding, LSTM, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import RMSprop
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau
from sklearn.model_selection import train_test_split

"""
加载词向量模型
"""
# Load pretrained Chinese word vectors in text word2vec format
# (300-dimensional "sgns.literature.word" embeddings).
# NOTE(review): path is hard-coded to one local Windows machine —
# parameterize before reuse.
cn_model = KeyedVectors.load_word2vec_format(
'C:/Users/Administrator/Desktop/project/pyproject/pytest/data/sgns.literature.word', binary=False)

# Dimensionality of each word vector; must match the pretrained model.
embedding_dim = 300

"""
Load the raw review texts.

Reads every file under ./pos (positive reviews) and ./neg (negative
reviews) into train_texts_orig, one string per review. Positive reviews
come first — the label vector built later relies on this ordering.
"""
pos_txts = os.listdir('pos')
neg_txts = os.listdir('neg')
# All reviews, one string each; positives first, then negatives.
train_texts_orig = []
for folder, names in (('pos', pos_txts), ('neg', neg_txts)):
    for name in names:
        # NOTE(review): encoding is unspecified, so the platform default
        # applies, and errors='ignore' silently drops undecodable bytes —
        # confirm the dataset encoding (often GB2312 for these reviews).
        with open(os.path.join(folder, name), 'r', errors='ignore') as f:
            train_texts_orig.append(f.read().strip())

"""
Tokenize: strip punctuation, segment with jieba, and map each word to its
index in the embedding vocabulary.
"""
# Strip whitespace and common Chinese/English punctuation. Compiled once and
# reused for every review; the raw string avoids Python's invalid-escape
# warnings while keeping the regex pattern itself unchanged.
_PUNCT_RE = re.compile(
    r"[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+")

train_tokens = []
for text in train_texts_orig:
    text = _PUNCT_RE.sub("", text)
    # jieba.cut yields a generator; materialize it so we can index-assign.
    cut_list = list(jieba.cut(text))
    for i, word in enumerate(cut_list):
        try:
            # Map the word to its row index in the embedding matrix.
            cut_list[i] = cn_model.vocab[word].index
        except KeyError:
            # Out-of-vocabulary words map to 0. NOTE(review): index 0 is
            # also the most frequent word's index AND the padding value, so
            # OOV, padding, and that word are conflated downstream.
            cut_list[i] = 0
    train_tokens.append(cut_list)

"""
Normalize sequence lengths.

Pick one common sequence length for the whole corpus: mean + 2 standard
deviations of the per-review token counts, which covers the vast majority
of reviews (longer ones get truncated, shorter ones padded).
"""
num_tokens = np.array([len(seq) for seq in train_tokens])

max_tokens = int(num_tokens.mean() + 2 * num_tokens.std())

"""
Prepare the embedding matrix.

Row i holds the vector of the i-th most frequent word in the pretrained
model; only the first num_words entries of the vocabulary are kept.
"""
num_words = 200000
embedding_matrix = np.zeros((num_words, embedding_dim))
for row in range(num_words):
    word = cn_model.index2word[row]
    embedding_matrix[row, :] = cn_model[word]
# Keras expects float32 weights.
embedding_matrix = embedding_matrix.astype('float32')

"""
Padding/truncating, label vector, and train/test split.
"""
# Pad short sequences with 0 at the front and truncate long ones from the
# front so every sample has exactly max_tokens indices.
train_pad = pad_sequences(train_tokens, maxlen=max_tokens,
                          padding='pre', truncating='pre')

# Any index outside the embedding matrix is treated as out-of-vocabulary.
train_pad[train_pad >= num_words] = 0

# Label vector: positives (loaded first) are 1, negatives are 0.
# Bug fix: the counts were hard-coded as 2000/2000, which silently mislabels
# samples whenever the dataset size differs; derive them from the actual
# file counts instead.
train_target = np.concatenate((np.ones(len(pos_txts)),
                               np.zeros(len(neg_txts))))

# Hold out 10% as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(train_pad,
                                                    train_target,
                                                    test_size=0.1,
                                                    random_state=12)

"""
Build the network.

Frozen pretrained embedding -> bidirectional LSTM -> LSTM -> sigmoid
output (probability that the review is positive).
"""
model = Sequential()
model.add(Embedding(num_words,
                    embedding_dim,
                    weights=[embedding_matrix],
                    input_length=max_tokens,
                    trainable=False))
model.add(Bidirectional(LSTM(units=32, return_sequences=True)))
model.add(LSTM(units=16, return_sequences=False))
model.add(Dense(1, activation='sigmoid'))

optimizer = Adam(lr=1e-3)
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

"""
Plot and display the network architecture.
"""
# Bug fix: show_layer_names was the *string* "False", which is truthy, so
# layer names were still drawn despite the apparent intent to hide them;
# pass the boolean False instead.
plot_model(model, to_file="model.png", show_shapes=True,
           show_layer_names=False, rankdir="TB")
img = plt.imread("model.png")
plt.imshow(img)
plt.show()

"""
Checkpointing.
"""
# Save only the best weights (lowest validation loss) seen during training.
path_checkpoint = 'sentiment_checkpoint.keras'
checkpoint = ModelCheckpoint(filepath=path_checkpoint,
                             monitor='val_loss',
                             verbose=1,
                             save_weights_only=True,
                             save_best_only=True)
# Resume from a previous run when a checkpoint exists; a failure to load
# (e.g. first run, no file yet) is reported and training starts fresh.
try:
    model.load_weights(path_checkpoint)
except Exception as load_error:
    print(load_error)

"""
Training.
"""
# Stop when validation loss has not improved for 3 consecutive epochs.
earlystopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
# Shrink the learning rate by 10x as soon as validation loss plateaus,
# but never below 1e-5.
lr_reduction = ReduceLROnPlateau(monitor='val_loss',
                                 factor=0.1,
                                 min_lr=1e-5,
                                 patience=0,
                                 verbose=1)
callbacks = [earlystopping, checkpoint, lr_reduction]

model.fit(X_train, y_train,
          validation_split=0.1,
          epochs=10,
          batch_size=128,
          callbacks=callbacks)

"""
Evaluation.
"""
# Score the model on the data it was trained on.
train_loss, train_accuracy = model.evaluate(X_train, y_train)
print('train loss: ', train_loss)
print('train accuracy: ', train_accuracy)
# Score the model on the held-out test set.
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print('test loss: ', test_loss)
print('test accuracy: ', test_accuracy)

"""
Sample predictions.
"""

def predict_sentiment(text):
    """Classify a single review string and print the verdict.

    Applies the same preprocessing as training (strip punctuation, jieba
    segmentation, vocabulary lookup, padding), runs the model, and prints
    whether the review is positive (output >= 0.5) or negative.

    Returns the raw sigmoid output so callers can also use the score
    programmatically (previously nothing was returned — backward
    compatible, since the old return value was None and unused).
    """
    print(text)
    # Same punctuation-stripping pattern as the training preprocessing
    # (raw string avoids invalid-escape warnings; pattern unchanged).
    text = re.sub(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+",
                  "", text)
    cut_list = list(jieba.cut(text))
    # Map words to embedding indices; OOV words and out-of-range indices
    # become 0, mirroring the training pipeline.
    for i, word in enumerate(cut_list):
        try:
            cut_list[i] = cn_model.vocab[word].index
            if cut_list[i] >= num_words:
                cut_list[i] = 0
        except KeyError:
            cut_list[i] = 0
    # Pad/truncate to the model's fixed input length.
    tokens_pad = pad_sequences([cut_list], maxlen=max_tokens,
                               padding='pre', truncating='pre')
    result = model.predict(x=tokens_pad)
    coef = result[0][0]
    if coef >= 0.5:
        print('是一例正面评价', 'output=%.2f' % coef)
    else:
        print('是一例负面评价', 'output=%.2f' % coef)
    return coef


test_list = [
'酒店设施不是新的,服务态度很不好',
'酒店卫生条件非常不好',
'床铺非常舒适',
'房间很凉,不给开暖气',
'房间很凉爽,空调冷气很足',
'酒店环境不好,住宿体验很不好',
'房间隔音不到位',
'晚上回来发现没有打扫卫生',
'因为过节所以要我临时加钱,比团购的价格贵'
]
for text in test_list[-2:]:
predict_sentiment(text)
panchaoxin wechat
关注我的公众号
支持一下