TensorFlow Text Classification Example
Learning by Example
After understanding the Embedding and Pooling principles covered previously, the TensorFlow example program is easy to follow:
1 Load the IMDB dataset (imdb);
2 Pad each sample to a length-256 array with keras.preprocessing.sequence.pad_sequences, yielding an m×256 matrix;
3 Build the model: first, Embedding maps the sparse input matrix to a dense representation, yielding an m×256×16 tensor;
4 Pooling then reduces the dimensionality, yielding an m×16 matrix;
5 Finally, two fully connected (Dense) layers produce m×16 and then m×1 matrices, giving one result per review, positive or negative (the shape flow is verified in the sketch after this list);
6 Training sets the neuron parameters, after which the test data can be classified.
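As a quick check of the shape flow listed above, here is a minimal sketch that builds the same architecture as the test program below and pushes a dummy batch (an assumed batch size of m = 2) through each layer to print the intermediate shapes:

import tensorflow as tf
from tensorflow import keras

vocab_size = 10000

# Same four layers as the test program below.
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, 16),      # m x 256      -> m x 256 x 16
    keras.layers.GlobalAveragePooling1D(),       # m x 256 x 16 -> m x 16
    keras.layers.Dense(16, activation='relu'),   # m x 16       -> m x 16
    keras.layers.Dense(1, activation='sigmoid'), # m x 16       -> m x 1
])

# A dummy batch of m = 2 padded reviews, each 256 word indices long.
x = tf.zeros((2, 256), dtype=tf.int32)
for layer in model.layers:
    x = layer(x)
    print(layer.name, x.shape)
# Expected shapes: (2, 256, 16), (2, 16), (2, 16), (2, 1)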
Test Program
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
from tensorflow import keras

print(tf.__version__)

# Load the IMDB dataset, keeping only the 10,000 most frequent words.
imdb = keras.datasets.imdb
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)
print("Training entries: {}, labels: {}".format(len(train_data), len(train_labels)))

# A dictionary mapping words to an integer index
word_index = imdb.get_word_index()

# The first indices are reserved
word_index = {k: (v + 3) for k, v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # unknown
word_index["<UNUSED>"] = 3

reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

for i in range(0, 10):
    print(decode_review(train_data[i]))
    print(train_labels[i])

# Pad every review to length 256 so each batch is an m x 256 matrix.
train_data = keras.preprocessing.sequence.pad_sequences(train_data,
                                                        value=word_index["<PAD>"],
                                                        padding='post',
                                                        maxlen=256)
test_data = keras.preprocessing.sequence.pad_sequences(test_data,
                                                       value=word_index["<PAD>"],
                                                       padding='post',
                                                       maxlen=256)

# Build the model: Embedding -> GlobalAveragePooling1D -> Dense -> Dense.
vocab_size = 10000
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, 16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))
model.summary()

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Hold out the first 10,000 training samples for validation.
x_val = train_data[:10000]
partial_x_train = train_data[10000:]
y_val = train_labels[:10000]
partial_y_train = train_labels[10000:]

history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=40,
                    batch_size=512,
                    validation_data=(x_val, y_val),
                    verbose=1)

results = model.evaluate(test_data, test_labels)
print(results)

# Plot training/validation loss and accuracy over the epochs.
history_dict = history.history

import matplotlib.pyplot as plt

acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

plt.clf()  # clear figure

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
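Once training finishes, the model can classify individual reviews (step 6 above). A minimal sketch, assuming the variables from the program above (model, test_data, test_labels) are still in scope; the 0.5 cutoff is an assumed decision threshold on the sigmoid output:

# model.predict returns one sigmoid probability per review, shape (3, 1).
predictions = model.predict(test_data[:3])
for prob, label in zip(predictions[:, 0], test_labels[:3]):
    verdict = 'positive' if prob > 0.5 else 'negative'  # 0.5 is an assumed cutoff
    print("p(positive) = {:.3f} -> {} (true label: {})".format(prob, verdict, label))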
A further note on Embedding and GlobalAveragePooling1D. Consider the following program: given a 2×10 document input matrix and a 10×4 parameter matrix, Embedding outputs a 2×10×4 tensor, and averaging over the word axis, which is what GlobalAveragePooling1D does, reduces it to a 2×4 matrix. This completes the conversion from the sparse 2×10 input matrix to a dense 2×4 matrix. (The demo below calls reduce_mean with keepdims=True, so it prints shape (2, 1, 4); GlobalAveragePooling1D drops that middle axis, giving 2×4.)
import numpy as np
import tensorflow as tf
from tensorflow.python.ops import embedding_ops, math_ops

# A 10x4 parameter matrix and a 2x10 batch of word ids.
params = tf.Variable(np.arange(40.0).reshape(10, 4))
ids = tf.constant([[0, 0, 1, 0, 0, 0, 0, 2, 0, 0],
                   [0, 0, 1, 0, 0, 0, 0, 2, 0, 0]])

# Look up each id's row in params: output shape (2, 10, 4).
batch_embedded_tensor = embedding_ops.embedding_lookup(params=params,
                                                       ids=ids,
                                                       partition_strategy='mod',
                                                       name="embedding")
print("########## params #########")
print(params)
print("########## ids #########")
print(ids)
print("########## batch_embedded_tensor #########")
print(batch_embedded_tensor)
print("########## math_ops.reduce_mean(input_tensor=batch_embedded_tensor, axis=[1], keepdims=True) #########")
print(math_ops.reduce_mean(input_tensor=batch_embedded_tensor, axis=[1], keepdims=True))

########## params #########
<tf.Variable 'Variable:0' shape=(10, 4) dtype=float64, numpy=
array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.],
       [12., 13., 14., 15.],
       [16., 17., 18., 19.],
       [20., 21., 22., 23.],
       [24., 25., 26., 27.],
       [28., 29., 30., 31.],
       [32., 33., 34., 35.],
       [36., 37., 38., 39.]])>
########## ids #########
tf.Tensor(
[[0 0 1 0 0 0 0 2 0 0]
 [0 0 1 0 0 0 0 2 0 0]], shape=(2, 10), dtype=int32)
########## batch_embedded_tensor #########
tf.Tensor(
[[[ 0.  1.  2.  3.]
  [ 0.  1.  2.  3.]
  [ 4.  5.  6.  7.]
  [ 0.  1.  2.  3.]
  [ 0.  1.  2.  3.]
  [ 0.  1.  2.  3.]
  [ 0.  1.  2.  3.]
  [ 8.  9. 10. 11.]
  [ 0.  1.  2.  3.]
  [ 0.  1.  2.  3.]]

 [[ 0.  1.  2.  3.]
  [ 0.  1.  2.  3.]
  [ 4.  5.  6.  7.]
  [ 0.  1.  2.  3.]
  [ 0.  1.  2.  3.]
  [ 0.  1.  2.  3.]
  [ 0.  1.  2.  3.]
  [ 8.  9. 10. 11.]
  [ 0.  1.  2.  3.]
  [ 0.  1.  2.  3.]]], shape=(2, 10, 4), dtype=float64)
########## math_ops.reduce_mean(input_tensor=batch_embedded_tensor, axis=[1], keepdims=True) #########
tf.Tensor(
[[[1.2 2.2 3.2 4.2]]

 [[1.2 2.2 3.2 4.2]]], shape=(2, 1, 4), dtype=float64)
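The same computation can be expressed with the Keras layers used in the model. A minimal sketch; setting the layer weights to np.arange(40.0) is only to reproduce the numbers above:

import numpy as np
from tensorflow import keras

ids = np.array([[0, 0, 1, 0, 0, 0, 0, 2, 0, 0],
                [0, 0, 1, 0, 0, 0, 0, 2, 0, 0]])

embedding = keras.layers.Embedding(10, 4)
_ = embedding(ids)  # call once so the layer builds its 10x4 weight matrix
embedding.set_weights([np.arange(40.0).reshape(10, 4)])  # same values as `params` above

embedded = embedding(ids)                                 # shape (2, 10, 4)
pooled = keras.layers.GlobalAveragePooling1D()(embedded)  # shape (2, 4): mean over the 10 positions
print(pooled.numpy())
# [[1.2 2.2 3.2 4.2]
#  [1.2 2.2 3.2 4.2]]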