kaggle | Text-classification models built from word2vec and common CNN/RNN network architectures

I have been filling in deep-learning background for my graduation requirements, and this post is a record of one attempt to combine word2vec with deep-learning models.

  • Dataset source
  • Dataset preprocessing
  • Building the word2vec model
  • Building and training the networks

Dataset source
The dataset comes from Kaggle's getting-started NLP competition, "Natural Language Processing with Disaster Tweets", where the task is to predict whether a tweet reports a real disaster.
Dataset preprocessing
Data import:

import numpy as np
import pandas as pd

train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

Data cleaning:

import re
import string

# lowercase the text
def text_to_lowercase(text):
    return text.lower()

# remove punctuation
def text_remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# remove URLs
def text_remove_url(text):
    return re.sub(r"http\S+", "", text)

# remove Twitter @handles
def text_remove_twitter_handle(text):
    return re.sub(r'@[^\s]+', '', text)

# strip() removes leading/trailing whitespace (spaces and newlines by default)
def text_remove_leadtrail_spaces(text):
    return text.strip()

def clean_text(text):
    # order matters: handles and URLs must be removed before punctuation is stripped
    text1 = text_remove_twitter_handle(text)
    text2 = text_remove_url(text1)
    text3 = text_remove_punctuation(text2)
    text4 = text_to_lowercase(text3)
    text5 = text_remove_leadtrail_spaces(text4)
    return text5

# clean the text column of both dataframes
train_df['text_processed'] = [clean_text(i) for i in train_df["text"]]
test_df['text_processed'] = [clean_text(i) for i in test_df["text"]]

feature = train_df['text_processed']
target = train_df['target']
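A quick sanity check of the cleaning order on a made-up tweet (the example string is hypothetical):

print(clean_text("@user Fire near the bridge!! http://t.co/abc123 "))
# -> 'fire near the bridge'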

Building the word2vec model

from gensim.models import Word2Vec

# Word2Vec expects tokenized sentences (a list of token lists), not raw strings
sentences = [text.split() for text in feature]

# Train the model: vector size 500, skip-gram (sg=1) with negative sampling (hs=0),
# window 6, 8 iterations, minimum word frequency 7;
# save the vectors in word2vec binary format (here with a .pkl extension)
w2v_model = Word2Vec(sentences, size=500, sg=1, hs=0, window=6, iter=8, min_count=7)
w2v_model.wv.save_word2vec_format("./word2Vec.pkl", binary=True)
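The saved vectors can be reloaded later without retraining, using gensim's KeyedVectors. The query word "fire" is only an illustration; any word that survived the min_count filter works:

from gensim.models import KeyedVectors

# reload the saved binary vectors
wv = KeyedVectors.load_word2vec_format("./word2Vec.pkl", binary=True)
print(wv.most_similar("fire", topn=5))  # nearest neighbours by cosine similarity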

Importing the packages

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
# model containers
from keras.models import Sequential, Model
# layers
from keras.layers import Dense, Embedding, Activation, Input
from keras.layers import Conv1D, Convolution1D, MaxPooling1D, MaxPool1D
from keras.layers import Flatten, Dropout
from keras.layers import BatchNormalization
from keras.layers import LSTM, GRU, Bidirectional
from keras.layers.merge import concatenate

Splitting the dataset and converting text to integer ids

# number of target classes
NUM_CLASS = 2
# length every input sequence is padded/truncated to
INPUT_SIZE = 64

# Tokenizer vectorizes text: it maps each word to an integer id
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" ")
tokenizer.fit_on_texts(feature)
vocab = tokenizer.word_index
x_ids = tokenizer.texts_to_sequences(feature)
pad_s = pad_sequences(x_ids, maxlen=INPUT_SIZE)

# one-hot encode the labels
target_u = to_categorical(target, NUM_CLASS)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(pad_s, target_u, random_state=22, test_size=0.2)
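For intuition, a toy run of the same two calls (the ids shown are illustrative; the real mapping depends on the fitted vocabulary):

demo = tokenizer.texts_to_sequences(["forest fire near la ronge"])
print(demo)  # e.g. [[254, 26, 116, 2871, 5892]]
padded = pad_sequences(demo, maxlen=INPUT_SIZE)
print(padded.shape)  # (1, 64); zeros are prepended up to INPUT_SIZE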

Building and training the networks
Loading word2vec into the Embedding layer

# rows: one per word id (id 0 is reserved); columns: the 500-dim word2vec vectors
embedding_matrix = np.zeros((len(vocab) + 1, 500))
for word, i in vocab.items():
    try:
        embedding_matrix[i] = w2v_model.wv[word]
    except KeyError:
        # words pruned by min_count have no vector; leave them as zero rows
        continue
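It can be worth checking how much of the tokenizer vocabulary actually received a pretrained vector, since everything else stays a zero row. A minimal sketch, assuming a pre-4.0 gensim where the vocabulary lives in w2v_model.wv.vocab:

covered = sum(1 for word in vocab if word in w2v_model.wv.vocab)
print("coverage: %d/%d words (%.1f%%)" % (covered, len(vocab), 100.0 * covered / len(vocab)))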

textCNN with word2vec
A classic textCNN: three parallel convolution branches with kernel sizes 3, 4 and 5 read the same embedded sequence, each branch is max-pooled, and the results are concatenated before the softmax classifier. Since padding='same' keeps each feature map at length 64, the pool sizes 38/37/36 all collapse a branch to a single 256-dim timestep.

main_input = Input(shape=(INPUT_SIZE,), dtype='float64')
embedder = Embedding(len(vocab) + 1, 500, input_length=INPUT_SIZE,
                     weights=[embedding_matrix], trainable=True)
embed = embedder(main_input)
cnn1 = Conv1D(256, 3, padding='same', strides=1, activation='relu')(embed)
cnn1 = MaxPooling1D(pool_size=38)(cnn1)
cnn2 = Conv1D(256, 4, padding='same', strides=1, activation='relu')(embed)
cnn2 = MaxPooling1D(pool_size=37)(cnn2)
cnn3 = Conv1D(256, 5, padding='same', strides=1, activation='relu')(embed)
cnn3 = MaxPooling1D(pool_size=36)(cnn3)
cnn = concatenate([cnn1, cnn2, cnn3], axis=-1)
flat = Flatten()(cnn)
drop = Dropout(0.2)(flat)
main_output = Dense(NUM_CLASS, activation='softmax')(drop)
model = Model(inputs=main_input, outputs=main_output)
model.summary()

Model architecture:
(figure: model.summary() output)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_test, y_test))
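Ten fixed epochs can overfit on a dataset this small; one option (my addition, not in the original) is Keras's EarlyStopping callback, which stops when validation loss stops improving and restores the best weights:

from keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
model.fit(X_train, y_train, batch_size=32, epochs=10,
          validation_data=(X_test, y_test), callbacks=[early_stop])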

Training output:
(figure: training log with per-epoch loss and accuracy)

Other models
A plain CNN with word2vec

model = Sequential()
# the Embedding layer turns each word id into its 500-dim word2vec vector
model.add(Embedding(len(vocab) + 1, 500, input_length=INPUT_SIZE,
                    weights=[embedding_matrix], trainable=True))
model.add(Conv1D(256, 5, padding='same'))
model.add(MaxPooling1D(3, 3, padding='same'))
model.add(Conv1D(128, 5, padding='same'))
model.add(MaxPooling1D(3, 3, padding='same'))
model.add(Conv1D(64, 3, padding='same'))
model.add(Flatten())
model.add(Dropout(0.1))
model.add(BatchNormalization())  # batch normalization layer
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(NUM_CLASS, activation='softmax'))
model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_test, y_test))
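Every model in this post fine-tunes the pretrained vectors (trainable=True). If you would rather keep the word2vec weights fixed, a one-flag variant of the embedding layer (my suggestion, not from the original) is:

# frozen variant: the pretrained vectors are not updated during training
model.add(Embedding(len(vocab) + 1, 500, input_length=INPUT_SIZE,
                    weights=[embedding_matrix], trainable=False))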

An RNN (LSTM) model with word2vec

model = Sequential()
model.add(Embedding(len(vocab) + 1, 500, input_length=INPUT_SIZE,
                    weights=[embedding_matrix], trainable=True))
model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.1))
model.add(Dense(NUM_CLASS, activation='softmax'))
model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_test, y_test))

A Bi-GRU model with word2vec

# architecture: embedding -> bidirectional GRU x 2 -> dense softmax
model = Sequential()
model.add(Embedding(len(vocab) + 1, 500, input_length=INPUT_SIZE,
                    weights=[embedding_matrix], trainable=True))
model.add(Bidirectional(GRU(256, dropout=0.2, recurrent_dropout=0.1, return_sequences=True)))
model.add(Bidirectional(GRU(256, dropout=0.2, recurrent_dropout=0.1)))
model.add(Dense(NUM_CLASS, activation='softmax'))
model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_test, y_test))

CNN+RNN in series with word2vec
The convolution and pooling run first, halving the 64-step sequence before the two GRU layers read it.

# architecture: embedding -> conv + max-pool -> GRU x 2 -> dense softmax
model = Sequential()
model.add(Embedding(len(vocab) + 1, 500, input_length=INPUT_SIZE,
                    weights=[embedding_matrix], trainable=True))
model.add(Convolution1D(256, 3, padding='same', strides=1))
model.add(Activation('relu'))
model.add(MaxPool1D(pool_size=2))
model.add(GRU(256, dropout=0.2, recurrent_dropout=0.1, return_sequences=True))
model.add(GRU(256, dropout=0.2, recurrent_dropout=0.1))
model.add(Dense(NUM_CLASS, activation='softmax'))
model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_test, y_test))

CNN+RNN in parallel with word2vec
The embedded sequence feeds a CNN branch and a bidirectional-GRU branch; each branch is projected to 256 dimensions, and the two outputs are concatenated before the softmax classifier.

# architecture: embedding -> (conv + pool + dense) and (Bi-GRU + dense) in parallel -> concatenate -> softmax
main_input = Input(shape=(INPUT_SIZE,), dtype='float64')
embed = Embedding(len(vocab) + 1, 500, input_length=INPUT_SIZE,
                  weights=[embedding_matrix], trainable=True)(main_input)
cnn = Convolution1D(256, 3, padding='same', strides=1, activation='relu')(embed)
cnn = MaxPool1D(pool_size=4)(cnn)
cnn = Flatten()(cnn)
cnn = Dense(256)(cnn)
rnn = Bidirectional(GRU(256, dropout=0.2, recurrent_dropout=0.1))(embed)
rnn = Dense(256)(rnn)
con = concatenate([cnn, rnn], axis=-1)
main_output = Dense(NUM_CLASS, activation='softmax')(con)
model = Model(inputs=main_input, outputs=main_output)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Train...')
model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_test, y_test))
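The post never scores Kaggle's held-out test set. A minimal sketch of producing a submission with whichever trained model you prefer, assuming the competition's standard id/target submission format:

# convert the cleaned test texts with the SAME fitted tokenizer
test_ids = tokenizer.texts_to_sequences(test_df['text_processed'])
test_pad = pad_sequences(test_ids, maxlen=INPUT_SIZE)

# pick the class with the highest softmax probability
pred = model.predict(test_pad).argmax(axis=1)

submission = pd.DataFrame({'id': test_df['id'], 'target': pred})
submission.to_csv('submission.csv', index=False)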

