# Complete, runnable example of creating a ``Dataset`` and ``DataLoader``
# for variable-length text sequences.
#
# Fixes over the original answer:
#   * ``vocab`` / ``unk`` were referenced but never defined (NameError).
#   * ``data['text'].values`` was called on a plain dict value (a list has
#     no ``.values`` attribute).
#   * ``BertTokenizer`` output was piped through a separate ``vocab`` lookup,
#     which is a category error — a HF tokenizer already returns ids. If you
#     want BERT ids, pass ``tokenizer.tokenize`` as ``tokenize`` and use
#     ``tokenizer.get_vocab()`` as ``vocab``; the import is therefore optional:
#     # from transformers import BertTokenizer

import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

# Token used as the fallback for out-of-vocabulary words; the vocab passed to
# TextDataset must contain it.
UNK = "<unk>"


class TextDataset(Dataset):
    """Map raw sentences to lists of vocabulary ids.

    Args:
        texts: iterable of raw sentence strings.
        vocab: mapping from token string to integer id; must contain ``UNK``.
        tokenize: callable splitting one sentence into tokens
            (default: whitespace split). A HF ``tokenizer.tokenize`` works too.

    Raises:
        KeyError: if ``vocab`` does not contain the ``UNK`` token and an
            out-of-vocabulary word is encountered.
    """

    def __init__(self, texts, vocab, tokenize=str.split):
        self.vocab = vocab
        # Tokenize each sentence once, up front, and map every token to its
        # id, falling back to the UNK id for unknown words.
        self.sequences = [
            [vocab.get(token, vocab[UNK]) for token in tokenize(text)]
            for text in texts
        ]

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sequences)

    def __getitem__(self, index):
        """Return the id sequence (list[int]) for one sentence."""
        return self.sequences[index]


def collate_fn(batch):
    """Right-pad a batch of id sequences to equal length.

    Args:
        batch: list of list[int] id sequences.

    Returns:
        LongTensor of shape (batch, max_len), zero-padded on the right.
    """
    # dtype=torch.long because embedding layers expect integer indices.
    return pad_sequence(
        [torch.tensor(seq, dtype=torch.long) for seq in batch],
        batch_first=True,
    )


if __name__ == "__main__":
    # Example data: a dict with a list of raw sentences.
    data = {"text": ["This is a sample sentence.", "Another example here."]}

    # Build a toy vocabulary from the data itself; id 0 is reserved for UNK
    # (which doubles as the padding value produced by pad_sequence).
    vocab = {UNK: 0}
    for sentence in data["text"]:
        for token in sentence.split():
            vocab.setdefault(token, len(vocab))

    # Create the dataset instance.
    dataset = TextDataset(data["text"], vocab)

    # Set the batch size and create the data loader.
    batch_size = 32
    dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)

    for batch in dataloader:
        print(batch.shape, batch)

# [2024-11-03 11:46:00 | AI写代码神器 | 338点数解答]