
Implementing a Chatbot with Seq2Seq

2021-08-08 00:13:13 ZSYL

1. Prepare training data

Single-turn chat data is hard to obtain, so here we use some open datasets from GitHub to train our chat model.

Dataset: https://github.com/codemayq/chaotbot_corpus_Chinese

There are two main datasets:

  1. Xiaohuangji ("Little Yellow Chicken") chat corpus: contains a lot of noise

  2. Weibo post titles and replies: relatively high quality

2. Data processing and storage

Because the data contains a lot of noise, we first do some basic cleaning, then save the questions and answers in two separate files: line N of the input file is a question, and line N of the target file is the corresponding answer.

Later we may use single characters as features (stored in input_word.txt) or whole words as features (stored in input.txt).
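
To make sure the two files really stay aligned line by line, a quick sanity check can be run after preprocessing (a minimal sketch; it assumes the cleaned files already exist under ./chatbot/corpus/):

# Minimal alignment check for the parallel question/answer files
with open("./chatbot/corpus/input.txt") as f_in, open("./chatbot/corpus/output.txt") as f_out:
    questions = f_in.readlines()
    answers = f_out.readlines()

assert len(questions) == len(answers), "input and output must have the same number of lines"
print(questions[0].strip())  # the question on line N ...
print(answers[0].strip())    # ... and the answer on the same line N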

2.1 Processing the Xiaohuangji corpus

import re
import jieba
from tqdm import tqdm

def format_xiaohuangji_corpus(word=False):
    """Process the Xiaohuangji corpus."""
    corpus_path = "./chatbot/corpus/xiaohuangji50w_nofenci.conv"
    if word:  # char-level features
        input_path = "./chatbot/corpus/input_word.txt"
        output_path = "./chatbot/corpus/output_word.txt"
    else:     # word-level features (jieba)
        input_path = "./chatbot/corpus/input.txt"
        output_path = "./chatbot/corpus/output.txt"

    f_input = open(input_path, "a")
    f_output = open(output_path, "a")
    pair = []
    for line in tqdm(open(corpus_path), ascii=True):
        if line.strip() == "E":  # "E" separates conversations
            if not pair:
                continue
            else:
                assert len(pair) == 2, "a pair must contain exactly 2 sentences"
                if len(pair[0].strip()) >= 1 and len(pair[1].strip()) >= 1:
                    f_input.write(pair[0] + "\n")
                    f_output.write(pair[1] + "\n")
                pair = []
        elif line.startswith("M"):  # "M" prefixes an utterance
            line = line[1:]
            if word:
                pair.append(" ".join(list(line.strip())))
            else:
                pair.append(" ".join(jieba.lcut(line.strip())))

    f_input.close()
    f_output.close()

2.2 Processing the Weibo corpus

def format_weibo(word=False):
    """Process the Weibo corpus. The raw files still contain some noise, cleaned below."""
    if word:
        origin_input = "./chatbot/corpus/stc_weibo_train_post"
        input_path = "./chatbot/corpus/input_word.txt"

        origin_output = "./chatbot/corpus/stc_weibo_train_response"
        output_path = "./chatbot/corpus/output_word.txt"

    else:
        origin_input = "./chatbot/corpus/stc_weibo_train_post"
        input_path = "./chatbot/corpus/input.txt"

        origin_output = "./chatbot/corpus/stc_weibo_train_response"
        output_path = "./chatbot/corpus/output.txt"

    f_input = open(input_path, "a")
    f_output = open(output_path, "a")
    with open(origin_input) as in_o, open(origin_output) as out_o:
        for _in, _out in tqdm(zip(in_o, out_o), ascii=True):
            _in = _in.strip()
            _out = _out.strip()

            # strip bracketed trailers such as （...）, 「...」 and (...)
            if _in.endswith("）") or _in.endswith("」") or _in.endswith(")"):
                _in = re.sub(r"（.*）|「.*?」|\(.*?\)", " ", _in)
            # strip Weibo boilerplate: location shares ("我在这里..."), alink tags,
            # image-size residue, hashtags, brackets, dashes/underscores, "via ..." credits
            _in = re.sub(r"我在这里.*?alink|alink|(.*?\d+x\d+.*?)|#|】|【|-+|_+|via.*?:*.*", " ", _in)

            _in = re.sub(r"\s+", " ", _in)
            if len(_in) < 1 or len(_out) < 1:
                continue

            if word:
                # collapse to one contiguous string, then split into single characters
                _in = re.sub(r"\s+", "", _in)
                _out = re.sub(r"\s+", "", _out)
                if len(_in) >= 1 and len(_out) >= 1:
                    f_input.write(" ".join(list(_in)) + "\n")
                    f_output.write(" ".join(list(_out)) + "\n")
            else:
                if len(_in) >= 1 and len(_out) >= 1:
                    f_input.write(_in.strip() + "\n")
                    f_output.write(_out.strip() + "\n")

    f_input.close()
    f_output.close()
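
Both preprocessors open their output files in append mode, so they can be run back to back to build a single combined corpus (a usage sketch; remove any stale input/output files first, otherwise re-runs will append duplicates):

if __name__ == '__main__':
    # char-level corpus -> input_word.txt / output_word.txt
    format_xiaohuangji_corpus(word=True)
    format_weibo(word=True)
    # jieba word-level corpus -> input.txt / output.txt
    format_xiaohuangji_corpus(word=False)
    format_weibo(word=False)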

2.3 Results of the processing

(Screenshots of the processed input and output files omitted.)

3. Construct text serialization and deserialization methods

As before, we need to convert text into numbers, and also implement the reverse mapping from numbers back to text.

# word_sequence.py
import config
import pickle

class Word2Sequence():
    UNK_TAG = "UNK"
    PAD_TAG = "PAD"
    SOS_TAG = "SOS"
    EOS_TAG = "EOS"

    UNK = 0
    PAD = 1
    SOS = 2
    EOS = 3

    def __init__(self):
        self.dict = {
            self.UNK_TAG: self.UNK,
            self.PAD_TAG: self.PAD,
            self.SOS_TAG: self.SOS,
            self.EOS_TAG: self.EOS
        }
        self.count = {}  # word -> frequency
        self.fited = False

    def to_index(self, word):
        """word -> index"""
        assert self.fited == True, "fit must be called first"
        return self.dict.get(word, self.UNK)

    def to_word(self, index):
        """index -> word"""
        assert self.fited, "fit must be called first"
        if index in self.inversed_dict:
            return self.inversed_dict[index]
        return self.UNK_TAG

    def __len__(self):
        return len(self.dict)

    def fit(self, sentence):
        """
        Count word frequencies.
        :param sentence: [word1, word2, word3, ...]
        """
        for a in sentence:
            if a not in self.count:
                self.count[a] = 0
            self.count[a] += 1

        self.fited = True

    def build_vocab(self, min_count=1, max_count=None, max_feature=None):
        """
        :param min_count: minimum word frequency to keep
        :param max_count: maximum word frequency to keep
        :param max_feature: maximum vocabulary size
        """
        # keep words whose frequency lies within [min_count, max_count]
        if min_count is not None:
            self.count = {k: v for k, v in self.count.items() if v >= min_count}
        if max_count is not None:
            self.count = {k: v for k, v in self.count.items() if v <= max_count}

        # cap the vocabulary size, keeping the most frequent words
        if isinstance(max_feature, int):
            count = sorted(list(self.count.items()), key=lambda x: x[1])
            if max_feature is not None and len(count) > max_feature:
                count = count[-int(max_feature):]
            for w, _ in count:
                self.dict[w] = len(self.dict)
        else:
            for w in sorted(self.count.keys()):
                self.dict[w] = len(self.dict)

        # build the reverse index -> word dictionary
        self.inversed_dict = dict(zip(self.dict.values(), self.dict.keys()))

    def transform(self, sentence, max_len=None, add_eos=False):
        """
        Turn a sentence into a list of indices (a vector).
        :param sentence: [word1, word2, ...]
        :param max_len: pad or truncate to this length
        :param add_eos: whether to append an EOS token
        """
        assert self.fited, "fit must be called first"

        r = [self.to_index(i) for i in sentence]
        if max_len is not None:
            if max_len > len(sentence):
                if add_eos:
                    r += [self.EOS] + [self.PAD for _ in range(max_len - len(sentence) - 1)]
                else:
                    r += [self.PAD for _ in range(max_len - len(sentence))]
            else:
                if add_eos:
                    r = r[:max_len - 1]
                    r += [self.EOS]
                else:
                    r = r[:max_len]
        else:
            if add_eos:
                r += [self.EOS]
        return r

    def inverse_transform(self, indices):
        """
        Turn a list of indices back into words.
        :param indices: [1, 2, 3, ...]
        :return: [word1, word2, ...]
        """
        sentence = []
        for i in indices:
            word = self.to_word(i)
            sentence.append(word)
        return sentence

# Elsewhere, import this prepared word_sequence instance for use
word_sequence = pickle.load(open("./pkl/ws.pkl","rb")) if not config.use_word else pickle.load(open("./pkl/ws_word.pkl","rb"))



if __name__ == '__main__':
    from word_sequence import Word2Sequence
    from tqdm import tqdm
    import pickle

    word_sequence = Word2Sequence()
    # Word level 
    input_path = "../corpus/input.txt"
    target_path = "../corpus/output.txt"
    for line in tqdm(open(input_path).readlines()):
        word_sequence.fit(line.strip().split())
    for line in tqdm(open(target_path).readlines()):
        word_sequence.fit(line.strip().split())
	
    # Keep at most the 5,000 most frequent words
    word_sequence.build_vocab(min_count=5, max_count=None, max_feature=5000)
    print(len(word_sequence))
    pickle.dump(word_sequence,open("./pkl/ws.pkl","wb"))
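
A quick round trip through the fitted vocabulary confirms that transform and inverse_transform are inverses of each other up to padding (a sketch; the concrete indices depend on the corpus and the min_count filter):

ws = pickle.load(open("./pkl/ws.pkl", "rb"))
tokens = "你 好".split()                  # an example word-level line
indices = ws.transform(tokens, max_len=9, add_eos=True)
print(indices)                            # e.g. [i1, i2, EOS=3, PAD=1, PAD=1, ...]
print(ws.inverse_transform(indices))      # ['你', '好', 'EOS', 'PAD', ...]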

4. Construct the Dataset and DataLoader

Create a dataset.py file and prepare the dataset:

import torch
import config
from torch.utils.data import Dataset,DataLoader
from word_sequence import word_sequence


class ChatDataset(Dataset):
    def __init__(self):
        super(ChatDataset,self).__init__()

        input_path = "../corpus/input.txt"
        target_path = "../corpus/output.txt"
        if config.use_word:
            input_path = "../corpus/input_word.txt"
            target_path = "../corpus/output_word.txt"

        self.input_lines = open(input_path).readlines()
        self.target_lines = open(target_path).readlines()
        assert len(self.input_lines) == len(self.target_lines), "input and target must have the same number of lines"

    def __getitem__(self, index):
        input = self.input_lines[index].strip().split()
        target = self.target_lines[index].strip().split()
        if len(input) == 0 or len(target) == 0:
            # fall back to the next pair when a line is empty (naive; assumes index+1 exists)
            input = self.input_lines[index + 1].strip().split()
            target = self.target_lines[index + 1].strip().split()
        # if a sentence is longer than max_len, the returned length is capped at max_len
        return input, target, min(len(input), config.max_len), min(len(target), config.max_len)

    def __len__(self):
        return len(self.input_lines)

def collate_fn(batch):
    # 1. sort by input length, descending (required by pack_padded_sequence)
    batch = sorted(batch, key=lambda x: x[2], reverse=True)
    input, target, input_length, target_length = zip(*batch)

    # 2. pad (and truncate) to max_len, appending EOS to the targets
    input = torch.LongTensor([word_sequence.transform(i, max_len=config.max_len) for i in input])
    target = torch.LongTensor([word_sequence.transform(i, max_len=config.max_len, add_eos=True) for i in target])
    input_length = torch.LongTensor(input_length)
    target_length = torch.LongTensor(target_length)

    return input, target, input_length, target_length

data_loader = DataLoader(dataset=ChatDataset(),batch_size=config.batch_size,shuffle=True,collate_fn=collate_fn,drop_last=True)

if __name__ == '__main__':
    for idx, (input, target, input_length, target_length) in enumerate(data_loader):
        print(idx)
        print(input)
        print(target)
        print(input_length)
        print(target_length)
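
Since collate_fn pads everything to config.max_len (appending EOS to the targets first), every batch has a fixed shape; this can be verified with a quick check (a sketch, run after the data_loader above is built):

batch_input, batch_target, batch_input_length, batch_target_length = next(iter(data_loader))
assert batch_input.shape == (config.batch_size, config.max_len)
assert batch_target.shape == (config.batch_size, config.max_len)   # EOS then PAD at the end
assert batch_input_length.shape == (config.batch_size,)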

5. Complete the Encoder logic

import torch.nn as nn
from word_sequence import word_sequence
import config


class Encoder(nn.Module):
    def __init__(self):
        super(Encoder,self).__init__()
        self.vocab_size = len(word_sequence)
        self.dropout = config.dropout
        self.embedding_dim = config.embedding_dim
        self.embedding = nn.Embedding(num_embeddings=self.vocab_size,
                                      embedding_dim=self.embedding_dim,
                                      padding_idx=word_sequence.PAD)
        # note: with num_layers=1, PyTorch warns that GRU dropout has no effect
        self.gru = nn.GRU(input_size=self.embedding_dim,
                          hidden_size=config.hidden_size,
                          num_layers=1,
                          batch_first=True,
                          dropout=config.dropout)

    def forward(self, input, input_length):
        embeded = self.embedding(input)  # [batch_size, seq_len, embedding_dim]
        # recent PyTorch versions expect lengths on the CPU
        embeded = nn.utils.rnn.pack_padded_sequence(embeded, lengths=input_length, batch_first=True)

        out, hidden = self.gru(embeded)
        out, outputs_length = nn.utils.rnn.pad_packed_sequence(out, batch_first=True, padding_value=word_sequence.PAD)
        # out: [batch_size, seq_len, hidden_size], hidden: [1, batch_size, hidden_size]
        return out, hidden
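
A dummy forward pass confirms the expected shapes. Note that pack_padded_sequence requires lengths sorted in descending order by default, which our collate_fn guarantees (a sketch with random token ids):

import torch

encoder = Encoder()
x = torch.randint(0, len(word_sequence), (config.batch_size, config.max_len))
lengths = torch.full((config.batch_size,), config.max_len, dtype=torch.int64)  # equal lengths, trivially sorted
out, hidden = encoder(x, lengths)
print(out.shape)     # [batch_size, seq_len, hidden_size]
print(hidden.shape)  # [1, batch_size, hidden_size]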

6. Complete the Decoder logic

import torch
import torch.nn as nn
import config
import random
import torch.nn.functional as F
from word_sequence import word_sequence

class Decoder(nn.Module):
    def __init__(self):
        super(Decoder,self).__init__()
        self.max_seq_len = config.max_len
        self.vocab_size = len(word_sequence)
        self.embedding_dim = config.embedding_dim
        self.dropout = config.dropout

        self.embedding = nn.Embedding(num_embeddings=self.vocab_size,embedding_dim=self.embedding_dim,padding_idx=word_sequence.PAD)
        self.gru = nn.GRU(input_size=self.embedding_dim,
                          hidden_size=config.hidden_size,
                          num_layers=1,
                          batch_first=True,
                          dropout=self.dropout)
        # output projection; log-softmax is applied in forward_step
        self.fc = nn.Linear(config.hidden_size, self.vocab_size)

    def forward(self, encoder_hidden, target, target_length):
        # encoder_hidden: [1, batch_size, hidden_size]
        # target: [batch_size, seq_len] (unused here: the loop below always feeds
        # back its own predictions, i.e. no teacher forcing)

        # the first input is an SOS token for every sequence in the batch
        decoder_input = torch.LongTensor([[word_sequence.SOS]] * config.batch_size).to(config.device)
        # container for the per-step distributions: [batch_size, max_len, vocab_size]
        decoder_outputs = torch.zeros(config.batch_size, config.max_len, self.vocab_size).to(config.device)

        # the encoder's final hidden state initializes the decoder
        decoder_hidden = encoder_hidden

        for t in range(config.max_len):
            decoder_output_t, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs[:, t, :] = decoder_output_t
            # greedy: take the most likely token as the next input; index: [batch_size, 1]
            value, index = torch.topk(decoder_output_t, 1)
            decoder_input = index
        return decoder_outputs, decoder_hidden

    def forward_step(self, decoder_input, decoder_hidden):
        """
        :param decoder_input: [batch_size, 1]
        :param decoder_hidden: [1, batch_size, hidden_size]
        :return: out: [batch_size, vocab_size], decoder_hidden: [1, batch_size, hidden_size]
        """
        embeded = self.embedding(decoder_input)  # [batch_size, 1, embedding_dim]
        out, decoder_hidden = self.gru(embeded, decoder_hidden)  # out: [batch_size, 1, hidden_size] (batch_first=True)
        out = F.log_softmax(self.fc(out), dim=-1)  # [batch_size, 1, vocab_size]
        out = out.squeeze(1)  # [batch_size, vocab_size]
        return out, decoder_hidden
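
Note that forward above always feeds the decoder its own prediction; the unused target argument (and the random import) suggest teacher forcing was intended. Below is a hedged sketch, not the original code, of how the loop body could mix in teacher forcing by feeding the ground-truth token with some probability:

# A sketch only: teacher forcing inside Decoder.forward
use_teacher_forcing = random.random() < 0.5  # hypothetical forcing ratio
for t in range(config.max_len):
    decoder_output_t, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
    decoder_outputs[:, t, :] = decoder_output_t
    if use_teacher_forcing:
        decoder_input = target[:, t].unsqueeze(1)   # ground truth as the next input
    else:
        value, index = torch.topk(decoder_output_t, 1)
        decoder_input = index                       # model prediction as the next input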

7. Complete the Seq2Seq model

import torch
import torch.nn as nn

class Seq2Seq(nn.Module):
    def __init__(self,encoder,decoder):
        super(Seq2Seq,self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input,target,input_length,target_length):
        encoder_outputs,encoder_hidden = self.encoder(input,input_length)
        decoder_outputs,decoder_hidden = self.decoder(encoder_hidden,target,target_length)
        return decoder_outputs,decoder_hidden

    def evaluation(self, inputs, input_length):
        encoder_outputs, encoder_hidden = self.encoder(inputs, input_length)
        # delegates to the decoder's evaluate method added in section 9
        decoded_sentence = self.decoder.evaluate(encoder_hidden)
        return decoded_sentence

8. Complete the training logic

To speed up training, consider running on the GPU; this requires moving both the tensors and the model to CUDA.

The current dataset contains more than 5 million pairs; on a GTX 1070 (8 GB), one epoch takes roughly 90 minutes, so be patient.

import os
import torch
import config
from torch import optim
import torch.nn as nn
from encoder import Encoder
from decoder import Decoder
from seq2seq import Seq2Seq
from dataset import data_loader as train_dataloader
from word_sequence import word_sequence

encoder = Encoder()
decoder = Decoder()
model = Seq2Seq(encoder,decoder)

# device is defined in the config file
model.to(config.device)

print(model)

# resume from a checkpoint if one exists
if os.path.exists("model/seq2seq_model.pkl"):
    model.load_state_dict(torch.load("model/seq2seq_model.pkl"))
optimizer = optim.Adam(model.parameters())
if os.path.exists("model/seq2seq_optimizer.pkl"):
    optimizer.load_state_dict(torch.load("model/seq2seq_optimizer.pkl"))
criterion = nn.NLLLoss(ignore_index=word_sequence.PAD, reduction="mean")

def get_loss(decoder_outputs, target):
    target = target.view(-1)  # [batch_size * max_len]
    decoder_outputs = decoder_outputs.view(config.batch_size * config.max_len, -1)  # [batch_size * max_len, vocab_size]
    return criterion(decoder_outputs, target)
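
A shape walk-through of the loss helps here: decoder_outputs [batch_size, max_len, vocab_size] is flattened to [batch_size*max_len, vocab_size] and target to [batch_size*max_len], and ignore_index=PAD keeps padded positions out of the average. A self-contained sketch with fake data (V is a hypothetical vocabulary size; PAD is index 1 in Word2Sequence):

import torch
import torch.nn as nn

V = 5000                                               # hypothetical vocab size
outputs = torch.randn(128, 9, V).log_softmax(dim=-1)   # fake log-probabilities [batch, max_len, V]
target = torch.randint(0, V, (128, 9))                 # fake targets [batch, max_len]
criterion = nn.NLLLoss(ignore_index=1, reduction="mean")
loss = criterion(outputs.view(128 * 9, V), target.view(-1))
print(loss.item())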


def train(epoch):
    for idx,(input,target,input_length,target_len) in enumerate(train_dataloader):
        input = input.to(config.device)
        target = target.to(config.device)
        input_length = input_length.to(config.device)
        target_len = target_len.to(config.device)

        optimizer.zero_grad()
        # decoder_outputs: [batch_size, max_len, vocab_size]; target: [batch_size, max_len]
        decoder_outputs, decoder_hidden = model(input, target, input_length, target_len)
        loss = get_loss(decoder_outputs,target)
        loss.backward()
        optimizer.step()

        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, idx * len(input), len(train_dataloader.dataset),
                   100. * idx / len(train_dataloader), loss.item()))

        # checkpoint every batch (simple but wasteful; once per epoch would suffice)
        torch.save(model.state_dict(), "model/seq2seq_model.pkl")
        torch.save(optimizer.state_dict(), 'model/seq2seq_optimizer.pkl')

if __name__ == '__main__':
    for i in range(10):
        train(i)

After training for 10 epochs the output looks like this; the loss is clearly still high:

Train Epoch: 9 [2444544/4889919 (50%)]	Loss: 4.923604
Train Epoch: 9 [2444800/4889919 (50%)]	Loss: 4.364594
Train Epoch: 9 [2445056/4889919 (50%)]	Loss: 4.613254
Train Epoch: 9 [2445312/4889919 (50%)]	Loss: 4.143538
Train Epoch: 9 [2445568/4889919 (50%)]	Loss: 4.412729
Train Epoch: 9 [2445824/4889919 (50%)]	Loss: 4.516526
Train Epoch: 9 [2446080/4889919 (50%)]	Loss: 4.124945
Train Epoch: 9 [2446336/4889919 (50%)]	Loss: 4.777015
Train Epoch: 9 [2446592/4889919 (50%)]	Loss: 4.358538
Train Epoch: 9 [2446848/4889919 (50%)]	Loss: 4.513412
Train Epoch: 9 [2447104/4889919 (50%)]	Loss: 4.202757
Train Epoch: 9 [2447360/4889919 (50%)]	Loss: 4.589584

The results are still poor!

9. Add an evaluation method to the Decoder

Add an evaluate method to the Decoder class (it also requires import numpy as np in decoder.py):

def evaluate(self, encoder_hidden):
    """
    Evaluation: analogous to forward, but always greedy.
    :param encoder_hidden: the encoder's last hidden state [1, batch_size, hidden_size]
    :return: decoder_outputs, predict_result
    """
    batch_size = encoder_hidden.size(1)
    # initialize a [batch_size, 1] SOS tensor as the first time step's input
    decoder_input = torch.LongTensor([[config.target_ws.SOS]] * batch_size).to(config.device)
    # encoder_hidden serves as the decoder's first hidden state [1, batch_size, hidden_size]
    decoder_hidden = encoder_hidden
    # [batch_size, seq_len, vocab_size] buffer collecting every time step's output
    decoder_outputs = torch.zeros((batch_size, config.chatbot_target_max_len, self.vocab_size)).to(config.device)
    # list storing the predicted indices of each time step
    predict_result = []
    for t in range(config.chatbot_target_max_len):
        decoder_output_t, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
        # store this time step; decoder_output_t: [batch_size, vocab_size]
        decoder_outputs[:, t, :] = decoder_output_t
        # during evaluation we always take the argmax prediction
        index = torch.argmax(decoder_output_t, dim=-1)
        # the prediction becomes the next time step's input
        decoder_input = index.unsqueeze(1)
        predict_result.append(index.cpu().detach().numpy())  # list of [batch_size] arrays
    # convert to an ndarray of shape (batch_size, seq_len):
    # each row is one predicted sequence of token indices
    predict_result = np.array(predict_result).transpose()
    return decoder_outputs, predict_result
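
Each row of predict_result can be turned back into text with inverse_transform (a sketch, assuming config.target_ws is the answer-side vocabulary):

# decode the first predicted sequence in the batch, dropping special tokens
tokens = config.target_ws.inverse_transform(predict_result[0])
sentence = ''.join(t for t in tokens if t not in ('SOS', 'EOS', 'PAD', 'UNK'))
print(sentence)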

eval.py (note: this and the following files assume a slightly refactored version of the project, with a Seq2SeqModel class, a get_dataloader helper, and input_ws/target_ws vocabularies in config):

import torch
import torch.nn as nn
import torch.nn.functional as F
from dataset import get_dataloader
import config
import numpy as np
from Seq2Seq import Seq2SeqModel
import os
from tqdm import tqdm
 
 
 
model = Seq2SeqModel().to(config.device)
if os.path.exists('./model/chatbot_model.pkl'):
    model.load_state_dict(torch.load('./model/chatbot_model.pkl'))
 
 
def eval():
    model.eval()
    loss_list = []
    test_data_loader = get_dataloader(train = False)
    with torch.no_grad():
        bar = tqdm(test_data_loader, desc = 'testing', total = len(test_data_loader))
        for idx, (input, target, input_length, target_length) in enumerate(bar):
            input = input.to(config.device)
            target = target.to(config.device)
            input_length = input_length.to(config.device)
            target_length = target_length.to(config.device)
            #  Get the prediction results of the model 
            decoder_outputs, predict_result = model.evaluation(input, input_length)
            #  Calculate the loss 
            loss = F.nll_loss(decoder_outputs.view(-1, len(config.target_ws)), target.view(-1), ignore_index = config.target_ws.PAD)
            loss_list.append(loss.item())
            bar.set_description('idx{}:/{}, loss:{}'.format(idx, len(test_data_loader), np.mean(loss_list)))
 
 
if __name__ == '__main__':
    eval()

interface.py:

from cut_sentence import cut
import torch
import config
from Seq2Seq import Seq2SeqModel
import os
 
 
# Simulate a chat session and answer the user's input
def interface():
    # load the trained model
    model = Seq2SeqModel().to(config.device)
    assert os.path.exists('./model/chatbot_model.pkl'), 'Please train the model first!'
    model.load_state_dict(torch.load('./model/chatbot_model.pkl'))
    model.eval()
 
    while True:
        # read the raw input string and segment it
        input_string = input('me>>:')
        if input_string == 'q':
            print('See you next time!')
            break
        input_cuted = cut(input_string, by_word=True)
        # serialize and wrap into tensors
        input_tensor = torch.LongTensor([config.input_ws.transform(input_cuted, max_len=config.chatbot_input_max_len)]).to(config.device)
        input_length_tensor = torch.LongTensor([len(input_cuted)]).to(config.device)
        # get the prediction
        outputs, predict = model.evaluation(input_tensor, input_length_tensor)
        # convert the predicted index sequence back to text
        result = config.target_ws.inverse_transform(predict[0])
        print('chatbot>>:', result)
 
 
if __name__ == '__main__':
    interface()

config.py:

import torch
from word_sequence import WordSequence
 
 
chatbot_input_path = './corpus/input.txt'
chatbot_target_path = './corpus/target.txt'
 
word_sequence = WordSequence()
max_len = 9
batch_size = 128
embedding_dim = 100
num_layer = 1
hidden_size = 64
dropout = 0.1
model_save_path = './model.pkl'
optimizer_save_path = './optimizer.pkl'
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
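
The surrounding files also reference a few names that this config.py does not define (use_word in dataset.py; input_ws, target_ws and the chatbot_*_max_len values in eval.py and interface.py). The following is a sketch of plausible definitions, assumptions rather than the author's original file:

# assumed additions, not from the original config.py
use_word = False                     # True -> use the char-level corpus (input_word.txt)
input_ws = word_sequence             # vocabulary for questions
target_ws = word_sequence            # vocabulary for answers (shared here)
chatbot_input_max_len = max_len      # used by interface.py
chatbot_target_max_len = max_len     # used by Decoder.evaluate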

cut.py:

"""  participle  """
import jieba
import config1
import string
import jieba.posseg as psg  #  Return part of speech 
from lib.stopwords import stopwords
 
#  Load Dictionary 
jieba.load_userdict(config1.user_dict_path)
#  Prepare English characters 
letters = string.ascii_lowercase + '+'
 
 
def cut_sentence_by_word(sentence):
    """Character-level segmentation that keeps English words intact."""
    temp = ''
    result = []
    for word in sentence:
        if word.lower() in letters:
            # English character: keep accumulating into one token
            temp += word
        else:
            # hit a Chinese character: flush any accumulated English first
            if temp != '':
                result.append(temp.lower())
                temp = ''
            result.append(word.strip())
    if temp != '':
        # English at the end of the sentence
        result.append(temp.lower())
    return result
 
 
def cut(sentence, by_word=False, use_stopwords=True, with_sg=False):
    """
    :param sentence: the sentence to segment
    :param by_word: True -> character-level, False -> jieba word-level
    :param use_stopwords: whether to filter stop words (default True)
    :param with_sg: whether to return part-of-speech tags
    :return: a list of tokens (or (token, tag) pairs)
    """
    if by_word:
        result = cut_sentence_by_word(sentence)
    else:
        result = psg.lcut(sentence)
        # psg yields pairs (i.word, i.flag): the token and its part-of-speech tag
        result = [(i.word, i.flag) for i in result]
        if not with_sg:
            result = [i[0] for i in result]
    # optionally filter stop words
    if use_stopwords:
        result = [i for i in result if i not in stopwords]

    return result
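
A quick usage sketch (the exact output depends on the user dictionary and the stop-word list):

if __name__ == '__main__':
    print(cut('python真好玩', by_word=True))   # char-level; 'python' stays one token
    print(cut('今天天气不错', with_sg=True))    # jieba tokens with part-of-speech tags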


Copyright notice: this article was created by [ZSYL]; please include a link to the original when reposting. Thanks.
https://chowdera.com/2021/08/20210808001155765O.html
