def generate_seg_file(input_file, output_seg_file):
    """Word-segment the content of every line in input_file with jieba.

    Each input line has the form '<label>TAB<content>'.  The output keeps
    the label and replaces the content with the space-joined segmented
    words (empty/whitespace-only tokens are dropped).

    :param input_file: path of the raw labeled corpus, utf-8 encoded
    :param output_seg_file: path of the segmented corpus to write
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    with open(output_seg_file, 'w', encoding='utf-8') as f:
        for line in lines:
            label, content = line.strip('\r\n').split('\t')
            # ' '.join over a generator replaces the original quadratic
            # `word_content += word + ' '` loop and its trailing strip.
            stripped = (word.strip(' ') for word in jieba.cut(content))
            word_content = ' '.join(word for word in stripped if word != '')
            f.write('%s\t%s\n' % (label, word_content))
def generate_vocab_file(input_seg_file, output_vocab_file):
    """Build a vocabulary file from a segmented corpus.

    Reads lines of the form '<label>TAB<space-separated words>' and writes
    one '<word>TAB<frequency>' line per distinct word, most frequent
    first.  A pseudo-entry '<UNK>\t10000000' is written first so that
    <UNK> always receives id 0 when the vocab file is loaded.

    :param input_seg_file: segmented corpus produced by generate_seg_file
    :param output_vocab_file: vocabulary file to write
    """
    # Local import keeps the block self-contained; Counter replaces the
    # original hand-rolled setdefault/+= counting.
    from collections import Counter
    word_counts = Counter()
    with open(input_seg_file, 'r', encoding="utf-8") as f:
        # Stream line by line instead of materializing readlines().
        for line in f:
            label, content = line.strip('\r\n').split('\t')
            word_counts.update(content.split())
    with open(output_vocab_file, 'w', encoding="utf-8") as f:
        f.write('<UNK>\t10000000\n')
        # most_common() sorts by descending frequency; ties keep
        # first-seen order -- same ordering as the original stable sort.
        for word, frequency in word_counts.most_common():
            f.write('%s\t%d\n' % (word, frequency))
def generate_category_dict(input_file, category_file):
    """Collect the distinct labels in input_file and write one per line.

    Also prints '<label>TAB<count>' to stdout for each category as a
    quick corpus summary (kept from the original behavior).

    :param input_file: labeled corpus, lines of '<label>TAB<content>'
    :param category_file: output file, one category name per line
    """
    # Counter replaces the hand-rolled setdefault/+= counting; the dead
    # local `category_number` from the original has been removed.
    from collections import Counter
    label_counts = Counter()
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            label, content = line.strip('\r\n').split('\t')
            label_counts[label] += 1
    with open(category_file, 'w', encoding='utf-8') as f:
        # Counter preserves first-seen insertion order, matching the
        # original dict iteration order.
        for category, count in label_counts.items():
            print('%s\t%d' % (category, count))
            f.write('%s\n' % category)
def _read_dict(self, filename):
    """Load a vocabulary file and assign each surviving word an id.

    The file has one 'word<TAB>frequency' entry per line.  Words whose
    frequency is below self._num_word_threshold are skipped; ids are
    assigned in file order.  The id chosen for '<UNK>' is remembered in
    self._unk.

    :param filename: vocabulary file path (utf-8)
    """
    with open(filename, 'r', encoding='UTF-8') as fp:
        for raw_line in fp:
            word, freq_text = raw_line.strip('\r\n').split('\t')
            if int(freq_text) < self._num_word_threshold:
                continue  # too rare -- leave it out of the vocab
            next_id = len(self._word_to_id)
            if word == '<UNK>':
                self._unk = next_id
            self._word_to_id[word] = next_id
def sentence_to_id(self, sentence):
    """Map each whitespace-separated token of *sentence* to its vocab id.

    :param sentence: segmented sentence, tokens separated by whitespace
    :return: list of integer ids, one per token (empty list for '')
    """
    return [self.word_to_id(token) for token in sentence.split()]
class CategeoryDict:
    """Maps category names to integer ids.

    The mapping is loaded from a text file containing one category name
    per line; ids are assigned in file order starting at 0.
    (Class name spelling kept as-is for compatibility with callers.)
    """

    def __init__(self, filename):
        """Load the category list from *filename* (utf-8, one per line)."""
        self._categeory_to_id = {}
        with open(filename, 'r', encoding='UTF-8') as fp:
            for raw_line in fp:
                name = raw_line.strip('\r\n')
                # len() before insertion yields 0, 1, 2, ... and matches
                # the original behavior even for duplicate lines.
                self._categeory_to_id[name] = len(self._categeory_to_id)

    def categeory_to_id(self, categeory):
        """Return the id of *categeory*.

        :raises Exception: if the category is not in the loaded list.
        """
        if categeory not in self._categeory_to_id:
            raise Exception(
                "%s is not in our categeory list" % categeory)
        return self._categeory_to_id[categeory]
def _generate_params_for_lstm_cell(x_size, h_size, bias_size):
    """Create the trainable variables for one LSTM gate.

    :param x_size: shape of the input-projection weight matrix
    :param h_size: shape of the recurrent (hidden) weight matrix
    :param bias_size: shape of the bias vector
    :return: tuple (input weights, hidden weights, biases); weights use
        the initializer of the enclosing variable_scope, biases start at 0.
    """
    input_weights = tf.get_variable('x_weights', x_size)
    hidden_weights = tf.get_variable('h_weights', h_size)
    biases = tf.get_variable(
        'biases', bias_size,
        initializer=tf.constant_initializer(0.0))
    return input_weights, hidden_weights, biases
# Hand-built parameters for a single-layer LSTM (hps.num_lstm_nodes[0]
# units).  All four gates share the same shapes: input projection
# [embedding, hidden], recurrent projection [hidden, hidden], and a
# [1, hidden] bias that broadcasts over the batch.
with tf.variable_scope('lstm_nn', initializer=lstm_init):
    # Input gate parameters (i).
    with tf.variable_scope('inputs'):
        ix, ih, ib = _generate_params_for_lstm_cell(
            x_size=[hps.num_embedding_size, hps.num_lstm_nodes[0]],
            h_size=[hps.num_lstm_nodes[0], hps.num_lstm_nodes[0]],
            bias_size=[1, hps.num_lstm_nodes[0]]
        )
    # Output gate parameters (o).
    with tf.variable_scope('outputs'):
        ox, oh, ob = _generate_params_for_lstm_cell(
            x_size=[hps.num_embedding_size, hps.num_lstm_nodes[0]],
            h_size=[hps.num_lstm_nodes[0], hps.num_lstm_nodes[0]],
            bias_size=[1, hps.num_lstm_nodes[0]]
        )
    # Forget gate parameters (f).
    with tf.variable_scope('forget'):
        fx, fh, fb = _generate_params_for_lstm_cell(
            x_size=[hps.num_embedding_size, hps.num_lstm_nodes[0]],
            h_size=[hps.num_lstm_nodes[0], hps.num_lstm_nodes[0]],
            bias_size=[1, hps.num_lstm_nodes[0]]
        )
    # Candidate cell-state ("memory") parameters (c~).
    with tf.variable_scope('memory'):
        cx, ch, cb = _generate_params_for_lstm_cell(
            x_size=[hps.num_embedding_size, hps.num_lstm_nodes[0]],
            h_size=[hps.num_lstm_nodes[0], hps.num_lstm_nodes[0]],
            bias_size=[1, hps.num_lstm_nodes[0]]
        )
# Initial cell state and hidden output, zero-filled and non-trainable:
# the Python names are rebound to new tensors at every timestep of the
# manually unrolled loop, so these only seed timestep 0.
state = tf.Variable(tf.zeros([batch_size, hps.num_lstm_nodes[0]]),
                    trainable=False)
h = tf.Variable(tf.zeros([batch_size, hps.num_lstm_nodes[0]]),
                trainable=False)
# Manually unrolled single-layer LSTM over num_timesteps steps.
for i in range(num_timesteps):
    # Slice out timestep i; the reshape pins the static shape to
    # [batch_size, num_embedding_size] (embed_inputs is batch x time x emb).
    embed_input = embed_inputs[:, i, :]
    embed_input = tf.reshape(embed_input,
                             [batch_size, hps.num_embedding_size])
    # Standard LSTM gate equations: sigmoid gates, tanh candidate state.
    forget_gate = tf.sigmoid(
        tf.matmul(embed_input, fx) + tf.matmul(h, fh) + fb)
    input_gate = tf.sigmoid(
        tf.matmul(embed_input, ix) + tf.matmul(h, ih) + ib)
    output_gate = tf.sigmoid(
        tf.matmul(embed_input, ox) + tf.matmul(h, oh) + ob)
    mid_state = tf.tanh(
        tf.matmul(embed_input, cx) + tf.matmul(h, ch) + cb)
    # c_t = i_t * c~_t + f_t * c_{t-1};  h_t = o_t * tanh(c_t)
    state = mid_state * input_gate + state * forget_gate
    h = output_gate * tf.tanh(state)
# The final hidden state is used as the sentence representation.
last = h
# NOTE(review): the string below is disabled alternative code -- the same
# network built from tf.contrib.rnn cells instead of the manual unroll.
# Kept verbatim (it is a no-op string statement, not executed).
'''
cells = []
for i in range(hps.num_lstm_layers):
    cell = tf.contrib.rnn.BasicLSTMCell(hps.num_lstm_nodes[i], state_is_tuple = True)
    cell = tf.contrib.rnn.DropoutWrapper(cell,output_keep_prob=keep_prob)
    cells.append(cell)
cell = tf.contrib.rnn.MultiRNNCell(cells)#Cell is 多层LSTM
initial_state = cell.zero_state(batch_size,tf.float32) #初始化隐藏状态为0
#RNNoutput:
# 一维:batch_size
# 二维:num_timesteps
# 三维:lstm_outputs[-1]
rnn_outputs, _ = tf.nn.dynamic_rnn(cell,embed_inputs,initial_state=initial_state)
last = rnn_outputs[:, -1, : ]
'''
'''FC层搭建'''
# Fully-connected classification head: ReLU hidden layer, dropout,
# then a linear layer producing one logit per class.
fc_init = tf.uniform_unit_scaling_initializer(factor=1.0)
with tf.variable_scope('fc', initializer=fc_init):
    fc1 = tf.layers.dense(last, hps.num_fc_nodes,
                          activation=tf.nn.relu,
                          name='fc1')
    fc1_dropout = tf.contrib.layers.dropout(fc1, keep_prob)
    logits = tf.layers.dense(fc1_dropout, num_classes, name='fc2')