The main task of this section is to convert the WikiText-2 dataset into BERT input sequences. The specific tasks include:
1. Read the wiki dataset and generate data for the next sentence prediction task: the _get_nsp_data_from_paragraph function generates training examples for next sentence prediction from an input paragraph.
2. Generate data for the masked language model task, i.e. randomly replace a portion of the generated tokens with masked tokens: the _get_mlm_data_from_tokens function produces the masked language model data.
3. Pad the inputs with the special '<pad>' token (the _pad_bert_inputs function).
4. Download the WikiText-2 dataset and generate pretraining examples from it: the load_data_wiki function returns train_iter and vocab.
We use the relatively small WikiText-2 corpus.
import os
import random
import torch
from d2l import torch as d2l
#@save
d2l.DATA_HUB['wikitext-2'] = (
    'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip',
    '3c914d17d80b1459be871a5039ac23e752a53cbe')
For simplicity, only the period is used as the delimiter to split sentences.
#@save
def _read_wiki(data_dir):
    file_name = os.path.join(data_dir, 'wiki.train.tokens')
    with open(file_name, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    # Convert uppercase letters to lowercase
    paragraphs = [line.strip().lower().split(' . ')
                  for line in lines if len(line.split(' . ')) >= 2]
    random.shuffle(paragraphs)
    return paragraphs
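As a quick illustration of the period-based splitting above, the following minimal sketch (the raw line is made up for illustration, not taken from wiki.train.tokens) shows how one line becomes a lowercased list of sentences:

raw_line = 'The Lion King is a 1994 film . It was produced by Disney . \n'
print(raw_line.strip().lower().split(' . '))
# ['the lion king is a 1994 film', 'it was produced by disney .']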
Generating the data for the next sentence prediction task (used by the _get_nsp_data_from_paragraph function):

#@save
def _get_next_sentence(sentence, next_sentence, paragraphs):
    if random.random() < 0.5:
        is_next = True
    else:
        # paragraphs is a triply nested list (paragraphs -> sentences -> tokens)
        next_sentence = random.choice(random.choice(paragraphs))
        is_next = False
    return sentence, next_sentence, is_next
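A minimal sketch (toy data made up for illustration) of how the nested random.choice sampling picks a random sentence from a random paragraph when building a negative pair:

toy_paragraphs = [[['the', 'cat', 'sat'], ['it', 'purred']],
                  [['dogs', 'bark'], ['loudly']]]
sentence_a, sentence_b, is_next = _get_next_sentence(
    toy_paragraphs[0][0], toy_paragraphs[0][1], toy_paragraphs)
print(sentence_a, sentence_b, is_next)
# With probability 0.5 sentence_b stays ['it', 'purred'] and is_next is True;
# otherwise sentence_b is a random sentence drawn from a random paragraph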
The following function generates training examples for next sentence prediction from an input paragraph by calling the _get_next_sentence function. Here paragraph is a list of sentences, where each sentence is a list of tokens. The argument max_len specifies the maximum length of a BERT input sequence during pretraining.
#@save
def _get_nsp_data_from_paragraph(paragraph, paragraphs, vocab, max_len):
    nsp_data_from_paragraph = []
    # Each element of nsp_data_from_paragraph is a (tokens, segments, is_next)
    # triple: the tokens, the segment ids marking which sentence each token
    # belongs to, and whether the second sentence really is the next sentence
    for i in range(len(paragraph) - 1):
        tokens_a, tokens_b, is_next = _get_next_sentence(
            paragraph[i], paragraph[i + 1], paragraphs)
        # Account for 1 '<cls>' token and 2 '<sep>' tokens
        if len(tokens_a) + len(tokens_b) + 3 > max_len:
            continue
        tokens, segments = d2l.get_tokens_and_segments(tokens_a, tokens_b)
        nsp_data_from_paragraph.append((tokens, segments, is_next))
    return nsp_data_from_paragraph
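For reference, a minimal sketch (toy token lists made up for illustration) of what d2l.get_tokens_and_segments returns: the two sentences joined with '<cls>' and '<sep>' tokens, plus segment ids that are 0 for the first sentence and 1 for the second:

tokens, segments = d2l.get_tokens_and_segments(
    ['this', 'movie', 'is', 'great'], ['i', 'like', 'it'])
print(tokens)
# ['<cls>', 'this', 'movie', 'is', 'great', '<sep>', 'i', 'like', 'it', '<sep>']
print(segments)
# [0, 0, 0, 0, 0, 0, 1, 1, 1, 1]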
Generating the data for the masked language model task: a portion of the generated tokens is randomly replaced with masked tokens. The helper function _replace_mlm_tokens below is used inside the _get_mlm_data_from_tokens function and takes the following inputs:
1. tokens: the list of tokens of a BERT input sequence;
2. candidate_pred_positions: the list of token indices of the BERT input sequence, excluding the special tokens (special tokens are not predicted in the masked language model task);
3. num_mlm_preds: the number of predictions (15% of the tokens are chosen at random to be predicted).
#@save
def _replace_mlm_tokens(tokens, candidate_pred_positions, num_mlm_preds,
                        vocab):
    # Make a new copy of tokens as the input of the masked language model,
    # where the input may contain replaced '<mask>' or random tokens
    mlm_input_tokens = [token for token in tokens]
    pred_positions_and_labels = []
    # Shuffle so that 15% of the tokens are chosen at random for prediction
    # in the masked language model task
    random.shuffle(candidate_pred_positions)
    for mlm_pred_position in candidate_pred_positions:
        # Stop once the number of predictions reaches the 15% limit
        if len(pred_positions_and_labels) >= num_mlm_preds:
            break
        masked_token = None
        # 80% of the time: replace the word with the '<mask>' token
        if random.random() < 0.8:
            masked_token = '<mask>'
        else:
            # 10% of the time: keep the word unchanged
            if random.random() < 0.5:
                masked_token = tokens[mlm_pred_position]
            # 10% of the time: replace the word with a random word
            else:
                masked_token = random.choice(vocab.idx_to_token)
        # Fill the masked position with '<mask>', a random token,
        # or the original token
        mlm_input_tokens[mlm_pred_position] = masked_token
        pred_positions_and_labels.append(
            (mlm_pred_position, tokens[mlm_pred_position]))
    return mlm_input_tokens, pred_positions_and_labels
Input: the tokens of a BERT input sequence.
Output:
1. the indices of the input tokens (after masking has been applied);
2. the positions (token indices) at which predictions are made;
3. the label indices for those predictions (some of the corresponding tokens will, of course, have been masked).
#@save
def _get_mlm_data_from_tokens(tokens, vocab):
    candidate_pred_positions = []
    # tokens is a list of strings
    for i, token in enumerate(tokens):
        # Special tokens are not predicted in the masked language model task
        if token in ['<cls>', '<sep>']:
            continue
        candidate_pred_positions.append(i)
    # 15% of random tokens are predicted in the masked language model task
    num_mlm_preds = max(1, round(len(tokens) * 0.15))
    mlm_input_tokens, pred_positions_and_labels = _replace_mlm_tokens(
        tokens, candidate_pred_positions, num_mlm_preds, vocab)
    pred_positions_and_labels = sorted(pred_positions_and_labels,
                                       key=lambda x: x[0])
    pred_positions = [v[0] for v in pred_positions_and_labels]
    mlm_pred_labels = [v[1] for v in pred_positions_and_labels]
    return vocab[mlm_input_tokens], pred_positions, vocab[mlm_pred_labels]
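To make the three outputs concrete, here is a minimal sketch (toy sentence and toy vocabulary made up for illustration) that runs the function on a single 10-token BERT input sequence; the exact output varies because the masked positions are sampled at random:

toy_vocab = d2l.Vocab([['this', 'movie', 'is', 'great'], ['i', 'like', 'it']],
                      reserved_tokens=['<pad>', '<mask>', '<cls>', '<sep>'])
toy_tokens = ['<cls>', 'this', 'movie', 'is', 'great', '<sep>',
              'i', 'like', 'it', '<sep>']
token_ids, pred_positions, label_ids = _get_mlm_data_from_tokens(
    toy_tokens, toy_vocab)
print(token_ids)       # ids of toy_tokens, with some positions masked/replaced
print(pred_positions)  # e.g. [2, 7]; round(10 * 0.15) = 2 positions are predicted
print(label_ids)       # ids of the original tokens at those positions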
Padding the inputs with the special '<pad>' token:
#@save
def _pad_bert_inputs(examples, max_len, vocab):
    max_num_mlm_preds = round(max_len * 0.15)
    all_token_ids, all_segments, valid_lens = [], [], []
    all_pred_positions, all_mlm_weights, all_mlm_labels = [], [], []
    nsp_labels = []
    for (token_ids, pred_positions, mlm_pred_label_ids, segments,
         is_next) in examples:
        # If the sequence is shorter than max_len, append '<pad>' ids
        all_token_ids.append(torch.tensor(token_ids + [vocab['<pad>']] * (
            max_len - len(token_ids)), dtype=torch.long))
        # All padded positions get segment id 0
        all_segments.append(torch.tensor(segments + [0] * (
            max_len - len(segments)), dtype=torch.long))
        # valid_lens excludes the '<pad>' tokens: it counts the original
        # token_ids, not the padded all_token_ids
        valid_lens.append(torch.tensor(len(token_ids), dtype=torch.float32))
        all_pred_positions.append(torch.tensor(pred_positions + [0] * (
            max_num_mlm_preds - len(pred_positions)), dtype=torch.long))
        # Predictions on padded positions are filtered out of the loss
        # by multiplying them with 0 weights
        all_mlm_weights.append(
            torch.tensor([1.0] * len(mlm_pred_label_ids) + [0.0] * (
                max_num_mlm_preds - len(pred_positions)),
                dtype=torch.float32))
        all_mlm_labels.append(torch.tensor(mlm_pred_label_ids + [0] * (
            max_num_mlm_preds - len(mlm_pred_label_ids)), dtype=torch.long))
        nsp_labels.append(torch.tensor(is_next, dtype=torch.long))
    return (all_token_ids, all_segments, valid_lens, all_pred_positions,
            all_mlm_weights, all_mlm_labels, nsp_labels)
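A minimal sketch (toy vocabulary and example made up for illustration, with an assumed max_len of 16) showing the padding for one 10-token example:

toy_vocab = d2l.Vocab([['this', 'movie', 'is', 'great'], ['i', 'like', 'it']],
                      reserved_tokens=['<pad>', '<mask>', '<cls>', '<sep>'])
# One example: (token_ids, pred_positions, mlm_pred_label_ids, segments, is_next)
toy_example = (toy_vocab[['<cls>', 'this', 'movie', '<mask>', 'great', '<sep>',
                          'i', 'like', 'it', '<sep>']],
               [3], toy_vocab[['is']], [0] * 6 + [1] * 4, True)
outputs = _pad_bert_inputs([toy_example], max_len=16, vocab=toy_vocab)
print([t.shape for t in (outputs[0][0], outputs[3][0], outputs[4][0])])
# [torch.Size([16]), torch.Size([2]), torch.Size([2])]: the tokens are padded to
# 16, and round(16 * 0.15) = 2 slots are reserved for masked-token predictions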
#@save
class _WikiTextDataset(torch.utils.data.Dataset):
    def __init__(self, paragraphs, max_len):
        # The input paragraphs[i] is a list of sentence strings representing a
        # paragraph; the output paragraphs[i] is a list of sentences
        # representing a paragraph, where each sentence is a list of tokens
        paragraphs = [d2l.tokenize(
            paragraph, token='word') for paragraph in paragraphs]
        sentences = [sentence for paragraph in paragraphs
                     for sentence in paragraph]
        self.vocab = d2l.Vocab(sentences, min_freq=5, reserved_tokens=[
            '<pad>', '<mask>', '<cls>', '<sep>'])
        # Get data for the next sentence prediction task
        examples = []
        for paragraph in paragraphs:
            examples.extend(_get_nsp_data_from_paragraph(
                paragraph, paragraphs, self.vocab, max_len))
        # Get data for the masked language model task
        examples = [(_get_mlm_data_from_tokens(tokens, self.vocab)
                     + (segments, is_next))
                    for tokens, segments, is_next in examples]
        # Pad the inputs
        (self.all_token_ids, self.all_segments, self.valid_lens,
         self.all_pred_positions, self.all_mlm_weights,
         self.all_mlm_labels, self.nsp_labels) = _pad_bert_inputs(
            examples, max_len, self.vocab)

    def __getitem__(self, idx):
        return (self.all_token_ids[idx], self.all_segments[idx],
                self.valid_lens[idx], self.all_pred_positions[idx],
                self.all_mlm_weights[idx], self.all_mlm_labels[idx],
                self.nsp_labels[idx])

    def __len__(self):
        return len(self.all_token_ids)

Downloading the WikiText-2 dataset and generating pretraining examples from it:
#@save
def load_data_wiki(batch_size, max_len):
    """Load the WikiText-2 dataset"""
    num_workers = d2l.get_dataloader_workers()
    data_dir = d2l.download_extract('wikitext-2', 'wikitext-2')
    paragraphs = _read_wiki(data_dir)
    train_set = _WikiTextDataset(paragraphs, max_len)
    train_iter = torch.utils.data.DataLoader(train_set, batch_size,
                                             shuffle=True,
                                             num_workers=num_workers)
    return train_iter, train_set.vocab
Setting the batch size to 512 and the maximum length of a BERT input sequence to 64, we print the shapes of a minibatch of BERT pretraining examples. Each sequence reserves round(64 * 0.15) = 10 positions for the masked language model to predict, so the token and segment tensors have shape (512, 64), the prediction positions, weights and labels have shape (512, 10), and the valid lengths and NSP labels have shape (512,).
batch_size, max_len = 512, 64
train_iter, vocab = load_data_wiki(batch_size, max_len)

if __name__ == '__main__':
    for (tokens_X, segments_X, valid_lens_x, pred_positions_X, mlm_weights_X,
         mlm_Y, nsp_y) in train_iter:
        print(tokens_X.shape, segments_X.shape, valid_lens_x.shape,
              pred_positions_X.shape, mlm_weights_X.shape, mlm_Y.shape,
              nsp_y.shape)
        break