# 安装 transformers、datasets 和 huggingface_hub 库（使用阿里云镜像源加速）

!pip install transformers -i https://mirrors.aliyun.com/pypi/simple/
!pip install datasets -i https://mirrors.aliyun.com/pypi/simple/
!pip install huggingface_hub -i https://mirrors.aliyun.com/pypi/simple/

import warnings
warnings.filterwarnings('ignore')
import transformers
import datasets
import huggingface_hub

transformers.__version__, datasets.__version__, huggingface_hub.__version__

('4.14.1', '2.19.1', '0.24.6')

```bash
HF_ENDPOINT="https://hf-mirror.com" hf download google-bert/bert-base-chinese
```

from transformers import AutoTokenizer

# 加载 Hugging Face 的 AutoTokenizer，用于分词和编码
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')

tokenizer

PreTrainedTokenizerFast(name_or_path='google-bert/bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

#简单编码
data = tokenizer.encode('你好,你好吗?')
data

[101, 872, 1962, 117, 872, 1962, 1408, 136, 102]

#解码
tokenizer.decode(data, skip_special_tokens=False)

'[CLS] 你 好, 你 好 吗? [SEP]'

# `text` is the primary (first) sentence of each pair;
# `text_pair` is the second sentence paired with it, used for sentence-pair
# tasks (e.g. sentence-relation inference, question answering).
data = tokenizer(
    # First half of each sentence pair
    text=['第一个句子', '第二个句子'],
    # Second half of each pair; omit it when encoding single sentences
    text_pair=['第三个句子', '第四个更长一点的句子'],
    # Whether to add special tokens
    add_special_tokens=True,
    # Whether to pad to a uniform length; usually True or 'max_length'
    padding=True,
    # Whether to truncate sentences longer than max_length; usually True
    truncation=True,
    max_length=20, 
    # Format of the encoded data: usually 'pt', 'np' or 'tf'; the default is list
    return_tensors='np',
)
# input_ids: the id of every token (word or character).
# token_type_ids: distinguishes the two sentences of a pair; 0 marks the first
#   sentence, 1 marks the second one, and padding positions may be 0.
# attention_mask: 1 marks a real token, 0 marks padding, so the model can
#   ignore the padded positions.
data

{'input_ids': array([[ 101, 5018,  671,  702, 1368, 2094,  102, 5018,  676,  702, 1368,
        2094,  102,    0,    0,    0,    0,    0],
       [ 101, 5018,  753,  702, 1368, 2094,  102, 5018, 1724,  702, 3291,
        7270,  671, 4157, 4638, 1368, 2094,  102]]), 'token_type_ids': array([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

#批量解码
tokenizer.batch_decode(data.input_ids, skip_special_tokens=False)

['[CLS] 第 一 个 句 子 [SEP] 第 三 个 句 子 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] 第 二 个 句 子 [SEP] 第 四 个 更 长 一 点 的 句 子 [SEP]']

#编码结果
data.input_ids

array([[ 101, 5018,  671,  702, 1368, 2094,  102, 5018,  676,  702, 1368,
        2094,  102,    0,    0,    0,    0,    0],
       [ 101, 5018,  753,  702, 1368, 2094,  102, 5018, 1724,  702, 3291,
        7270,  671, 4157, 4638, 1368, 2094,  102]])

# 在句子对任务中，token_type_ids 会自动为每个句子分配不同的 segment id
# 前一句（text）对应的 token_type_ids 为 0，后一句（text_pair）对应的 token_type_ids 为 1
# 如果句子长度不同，padding 部分也会分配 segment id
# 你可以通过 batch_decode(data.input_ids) 对比每个 token 的实际内容和 token_type_ids
for input_ids, token_type_ids in zip(data.input_ids, data.token_type_ids):
    tokens = tokenizer.batch_decode([input_ids], skip_special_tokens=False)[0]
    print("Tokens:", tokens)
    print("Token Type IDs:", token_type_ids)
    print("=" * 30)
data.token_type_ids

array([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

#标记哪些位置是pad
data.attention_mask

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

from datasets import load_dataset
#可用的数据集:https://huggingface.co/datasets
#在线加载一个数据集
dataset = load_dataset(path='lansinuote/ChnSentiCorp', name=None, split=None)
dataset, dataset['train'][0]

Generating train split: 100%|██████████| 9600/9600 [00:00<00:00, 643596.35 examples/s]
Generating validation split: 100%|██████████| 1200/1200 [00:00<00:00, 583555.34 examples/s]
Generating test split: 100%|██████████| 1200/1200 [00:00<00:00, 719846.22 examples/s]

(DatasetDict({
     train: Dataset({
         features: ['text', 'label'],
         num_rows: 9600
     })
     validation: Dataset({
         features: ['text', 'label'],
         num_rows: 1200
     })
     test: Dataset({
         features: ['text', 'label'],
         num_rows: 1200
     })
 }),
 {'text': '选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般',
  'label': 1})

#保存到本地
dataset.save_to_disk('dataset/lansinuote/ChnSentiCorp')

Saving the dataset (1/1 shards): 100%|██████████| 9600/9600 [00:00<00:00, 691641.93 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1200/1200 [00:00<00:00, 329935.42 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1200/1200 [00:00<00:00, 428792.37 examples/s]

from datasets import load_from_disk
#从本地加载
dataset = load_from_disk('dataset/lansinuote/ChnSentiCorp')
dataset, dataset['train'][0]

(DatasetDict({
     train: Dataset({
         features: ['text', 'label'],
         num_rows: 9600
     })
     validation: Dataset({
         features: ['text', 'label'],
         num_rows: 1200
     })
     test: Dataset({
         features: ['text', 'label'],
         num_rows: 1200
     })
 }),
 {'text': '选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般',
  'label': 1})

dataset = load_from_disk('dataset/lansinuote/ChnSentiCorp')
#遍历每一条数据并进行处理
def f(data):
    """Transform one dataset row in place and return it.

    Demonstrates the three kinds of per-row edits `map` can perform:
    modifying a field ('text' gets a 'prefix_' prefix), adding a field
    ('new_column' holds the first five characters of the original text),
    and deleting a field ('label').
    """
    raw_text = data['text']
    # Modify an existing field.
    data['text'] = 'prefix_' + raw_text
    # Add a new field derived from the untouched text.
    data['new_column'] = raw_text[:5]
    # Drop a field.
    del data['label']
    return data

dataset = dataset.map(f)
dataset, dataset['train'][0]

Map: 100%|██████████| 9600/9600 [00:00<00:00, 52173.98 examples/s]
Map: 100%|██████████| 1200/1200 [00:00<00:00, 54182.39 examples/s]
Map: 100%|██████████| 1200/1200 [00:00<00:00, 54164.90 examples/s]

(DatasetDict({
     train: Dataset({
         features: ['text', 'new_column'],
         num_rows: 9600
     })
     validation: Dataset({
         features: ['text', 'new_column'],
         num_rows: 1200
     })
     test: Dataset({
         features: ['text', 'new_column'],
         num_rows: 1200
     })
 }),
 {'text': 'prefix_选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般',
  'new_column': '选择珠江花'})

dataset = load_from_disk('dataset/lansinuote/ChnSentiCorp')
#过滤数据集
f = lambda x: x['label'] == 1
dataset = dataset.filter(f)
dataset, dataset['train'][0]

Filter: 100%|██████████| 9600/9600 [00:00<00:00, 323598.15 examples/s]
Filter: 100%|██████████| 1200/1200 [00:00<00:00, 348967.95 examples/s]
Filter: 100%|██████████| 1200/1200 [00:00<00:00, 353179.76 examples/s]

(DatasetDict({
     train: Dataset({
         features: ['text', 'label'],
         num_rows: 4799
     })
     validation: Dataset({
         features: ['text', 'label'],
         num_rows: 593
     })
     test: Dataset({
         features: ['text', 'label'],
         num_rows: 608
     })
 }),
 {'text': '选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般',
  'label': 1})

dataset = load_from_disk('dataset/lansinuote/ChnSentiCorp')
#map和filter时可以使用批量处理,同时使用多线程并行处理
#在某些环境下使用多线程会卡住,出现这种情况时请切换到单线程,num_proc=1
def f(data):
    """Batched map function: add 'new_column' to a batch of rows.

    `data` maps column names to lists of values; the new column holds the
    first five characters of every entry in the 'text' column. The batch is
    modified in place and returned.
    """
    data['new_column'] = list(map(lambda text: text[:5], data['text']))
    return data

dataset = dataset.map(f, batched=True, batch_size=5, num_proc=2)
dataset, dataset['train'][0]

Map (num_proc=2): 100%|██████████| 9600/9600 [00:00<00:00, 23257.38 examples/s]
Map (num_proc=2): 100%|██████████| 1200/1200 [00:00<00:00, 8713.55 examples/s]
Map (num_proc=2): 100%|██████████| 1200/1200 [00:00<00:00, 9443.48 examples/s]

(DatasetDict({
     train: Dataset({
         features: ['text', 'label', 'new_column'],
         num_rows: 9600
     })
     validation: Dataset({
         features: ['text', 'label', 'new_column'],
         num_rows: 1200
     })
     test: Dataset({
         features: ['text', 'label', 'new_column'],
         num_rows: 1200
     })
 }),
 {'text': '选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般',
  'label': 1,
  'new_column': '选择珠江花'})

dataset = load_from_disk('dataset/lansinuote/ChnSentiCorp')
#删除字段
dataset = dataset.remove_columns(['label'])

dataset, dataset['train'][0]

(DatasetDict({
     train: Dataset({
         features: ['text'],
         num_rows: 9600
     })
     validation: Dataset({
         features: ['text'],
         num_rows: 1200
     })
     test: Dataset({
         features: ['text'],
         num_rows: 1200
     })
 }),
 {'text': '选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般'})

dataset = load_from_disk('dataset/lansinuote/ChnSentiCorp')
#重命名字段
dataset = dataset.rename_columns({'text': 'new_text', 'label': 'new_label'})

dataset, dataset['train'][0]

(DatasetDict({
     train: Dataset({
         features: ['new_text', 'new_label'],
         num_rows: 9600
     })
     validation: Dataset({
         features: ['new_text', 'new_label'],
         num_rows: 1200
     })
     test: Dataset({
         features: ['new_text', 'new_label'],
         num_rows: 1200
     })
 }),
 {'new_text': '选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般',
  'new_label': 1})

dataset = load_from_disk('dataset/lansinuote/ChnSentiCorp')
# Set the output data type of selected columns.
# Accepted type names: 'numpy' (np), 'torch' (pt), 'tensorflow' (tf).
print("可选的数据类型: 'np' (numpy), 'pt' (torch), 'tf' (tensorflow)")
# Only the 'label' column is converted to a torch tensor;
# output_all_columns=True keeps the remaining columns (here 'text') in the
# returned rows, unformatted.
dataset.set_format('pt', columns=['label'], output_all_columns=True)

dataset, dataset['train'][0]

(DatasetDict({
     train: Dataset({
         features: ['text', 'label'],
         num_rows: 9600
     })
     validation: Dataset({
         features: ['text', 'label'],
         num_rows: 1200
     })
     test: Dataset({
         features: ['text', 'label'],
         num_rows: 1200
     })
 }),
 {'label': tensor(1),
  'text': '选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般'})

from datasets import concatenate_datasets

dataset = load_from_disk('dataset/lansinuote/ChnSentiCorp')
#合并多个数据集
dataset = concatenate_datasets(list(dataset.values()))
dataset, dataset[0]

(Dataset({
     features: ['text', 'label'],
     num_rows: 12000
 }),
 {'text': '选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般',
  'label': 1})

dataset = load_from_disk('dataset/lansinuote/ChnSentiCorp')

#切分一个数据集为训练集和测试,可以指定比例,也可以直接指定数量
dataset = dataset['train'].train_test_split(test_size=0.1, train_size=8640)
dataset, dataset['train'][0]

(DatasetDict({
     train: Dataset({
         features: ['text', 'label'],
         num_rows: 8640
     })
     test: Dataset({
         features: ['text', 'label'],
         num_rows: 960
     })
 }),
 {'text': '香港的酒店在面積方面就不需要強求什麼了﹗這家酒店設施已經比較陳舊了，不過交通比較方便的﹗服務方面尚可﹗', 'label': 1})

dataset = load_from_disk('dataset/lansinuote/ChnSentiCorp')

#取数据集中的某些数据
dataset = dataset['train'].select([5, 15, 20, 50])
dataset, dataset[0]

(Dataset({
     features: ['text', 'label'],
     num_rows: 4
 }),
 {'text': '机器背面似乎被撕了张什么标签，残胶还在。但是又看不出是什么标签不见了，该有的都在，怪', 'label': 0})

from datasets import Dataset

#从一个字典创建数据集
dataset = {'name': ['小王', '小李'], 'age': [12, 16], 'teacher': ['张老师', '马老师']}
dataset = Dataset.from_dict(dataset)
dataset, dataset[0]

(Dataset({
     features: ['name', 'age', 'teacher'],
     num_rows: 2
 }),
 {'name': '小王', 'age': 12, 'teacher': '张老师'})

#使用一个函数生成数据集
def f():
    """Yield the sample records, one dict per row, for `Dataset.from_generator`."""
    rows = (
        ('小王', 12, '张老师'),
        ('小李', 16, '马老师'),
    )
    for name, age, teacher in rows:
        yield {'name': name, 'age': age, 'teacher': teacher}


dataset = Dataset.from_generator(f)

dataset, dataset[0]

Generating train split: 2 examples [00:00, 888.34 examples/s]

(Dataset({
     features: ['name', 'age', 'teacher'],
     num_rows: 2
 }),
 {'name': '小王', 'age': 12, 'teacher': '张老师'})

from datasets import load_dataset

#保存一个数据集为csv格式
dataset.to_csv('dataset/sample.csv')
#从csv文件创建数据集
dataset = load_dataset('csv', data_files='dataset/sample.csv', split='train')
dataset, dataset[0]

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 182.12ba/s]
Generating train split: 2 examples [00:00, 302.11 examples/s]

(Dataset({
     features: ['name', 'age', 'teacher'],
     num_rows: 2
 }),
 {'name': '小王', 'age': 12, 'teacher': '张老师'})

from datasets import load_dataset

#保存一个数据集为json格式
dataset.to_json('dataset/sample.json')
#从json文件创建数据集
# 从json文件创建数据集，split='train' 表示只加载数据集中的 'train' 切分部分
dataset = load_dataset('json', data_files='dataset/sample.json', split='train')

dataset, dataset[0]

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 417.05ba/s]
Generating train split: 2 examples [00:00, 202.25 examples/s]

(Dataset({
     features: ['name', 'age', 'teacher'],
     num_rows: 2
 }),
 {'name': '小王', 'age': 12, 'teacher': '张老师'})

import torch
from transformers import BertConfig, BertModel
# Build a BERT model from a configuration object instead of a pretrained
# checkpoint.
# vocab_size: size of the vocabulary, i.e. how many distinct tokens the model
#   can handle.
# num_hidden_layers: number of Transformer encoder layers; more layers make
#   the model deeper and more expressive.
config = BertConfig(vocab_size=15000, num_hidden_layers=4)
model = BertModel(config)
# Trial forward pass: a batch of 4 sequences with 125 token ids each.
# NOTE(review): the name `input` shadows the Python builtin. Later cells
# reuse this variable, so a rename has to be applied to all of them at once.
input = {
    'input_ids': torch.randint(100, 10000, [4, 125]),
    'attention_mask': torch.ones(4, 125).long()
}
with torch.no_grad():
    out = model(**input)
# The result is a vector representation of the 4 sequences;
# downstream tasks can be built on top of these vectors.
config, out.last_hidden_state.shape

(BertConfig {
   "attention_probs_dropout_prob": 0.1,
   "classifier_dropout": null,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
   "hidden_size": 768,
   "initializer_range": 0.02,
   "intermediate_size": 3072,
   "layer_norm_eps": 1e-12,
   "max_position_embeddings": 512,
   "model_type": "bert",
   "num_attention_heads": 12,
   "num_hidden_layers": 4,
   "pad_token_id": 0,
   "position_embedding_type": "absolute",
   "transformers_version": "4.14.1",
   "type_vocab_size": 2,
   "use_cache": true,
   "vocab_size": 15000
 },
 torch.Size([4, 125, 768]))

from transformers import GPT2Config, GPT2Model
#使用配置文件创建一个gpt2模型
config = GPT2Config(vocab_size=15000, n_layer=4)
model = GPT2Model(config)
#执行试算
with torch.no_grad():
    out = model(**input)
config, out.last_hidden_state.shape

(GPT2Config {
   "activation_function": "gelu_new",
   "attn_pdrop": 0.1,
   "bos_token_id": 50256,
   "embd_pdrop": 0.1,
   "eos_token_id": 50256,
   "initializer_range": 0.02,
   "layer_norm_epsilon": 1e-05,
   "model_type": "gpt2",
   "n_embd": 768,
   "n_head": 12,
   "n_inner": null,
   "n_layer": 4,
   "n_positions": 1024,
   "reorder_and_upcast_attn": false,
   "resid_pdrop": 0.1,
   "scale_attn_by_inverse_layer_idx": false,
   "scale_attn_weights": true,
   "summary_activation": null,
   "summary_first_dropout": 0.1,
   "summary_proj_to_labels": true,
   "summary_type": "cls_index",
   "summary_use_proj": true,
   "transformers_version": "4.14.1",
   "use_cache": true,
   "vocab_size": 15000
 },
 torch.Size([4, 125, 768]))

from transformers import BertConfig, BertForSequenceClassification
#直接创建一个语句分类模型
config = BertConfig(vocab_size=15000, num_hidden_layers=4, num_labels=3)
model = BertForSequenceClassification(config)
#执行试算,参数中包括labels,可以直接计算loss
input_with_labels = {
    'input_ids': torch.randint(100, 10000, [4, 125]),
    'attention_mask': torch.ones(4, 125).long(),
    'labels': torch.ones(4).long()
}
with torch.no_grad():
    out = model(**input_with_labels)
config, out.loss, out.logits.shape

(BertConfig {
   "attention_probs_dropout_prob": 0.1,
   "classifier_dropout": null,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
   "hidden_size": 768,
   "id2label": {
     "0": "LABEL_0",
     "1": "LABEL_1",
     "2": "LABEL_2"
   },
   "initializer_range": 0.02,
   "intermediate_size": 3072,
   "label2id": {
     "LABEL_0": 0,
     "LABEL_1": 1,
     "LABEL_2": 2
   },
   "layer_norm_eps": 1e-12,
   "max_position_embeddings": 512,
   "model_type": "bert",
   "num_attention_heads": 12,
   "num_hidden_layers": 4,
   "pad_token_id": 0,
   "position_embedding_type": "absolute",
   "problem_type": "single_label_classification",
   "transformers_version": "4.14.1",
   "type_vocab_size": 2,
   "use_cache": true,
   "vocab_size": 15000
 },
 tensor(1.5939),
 torch.Size([4, 3]))

from transformers import AutoModel

#可用的模型:https://huggingface.co/models
#在线加载一个预训练模型
model = AutoModel.from_pretrained('google-bert/bert-base-chinese')
#执行试算
with torch.no_grad():
    out = model(**input)

out.last_hidden_state.shape

Some weights of the model checkpoint at google-bert/bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

torch.Size([4, 125, 768])

#保存一个模型到本地磁盘
model.save_pretrained('model/google-bert/bert-base-chinese')

#从本地磁盘加载模型
model = AutoModel.from_pretrained('model/google-bert/bert-base-chinese')

import torch
from transformers import AutoTokenizer
#加载tokenizer
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')

tokenizer

PreTrainedTokenizerFast(name_or_path='google-bert/bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

from datasets import load_dataset

#加载数据集
dataset = load_dataset(path='lansinuote/ChnSentiCorp')
dataset, dataset['train'][0]

(DatasetDict({
     train: Dataset({
         features: ['text', 'label'],
         num_rows: 9600
     })
     validation: Dataset({
         features: ['text', 'label'],
         num_rows: 1200
     })
     test: Dataset({
         features: ['text', 'label'],
         num_rows: 1200
     })
 }),
 {'text': '选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般',
  'label': 1})

#定义数据集遍历工具
def collate_fn(data):
    """Turn a list of dataset rows into one tokenized batch.

    Each row is a mapping with a 'text' and a 'label' entry. The texts are
    tokenized together (padded to the longest one, truncated at 500 tokens,
    returned as PyTorch tensors without token_type_ids) and the labels are
    attached to the encoding under the 'label' key as a LongTensor.
    """
    texts, labels = [], []
    for row in data:
        texts.append(row['text'])
        labels.append(row['label'])
    # Encode the texts.
    batch = tokenizer(texts,
                      padding=True,
                      truncation=True,
                      max_length=500,
                      return_tensors='pt',
                      return_token_type_ids=False)
    # Attach the labels.
    batch['label'] = torch.LongTensor(labels)
    return batch

loader = torch.utils.data.DataLoader(dataset['train'],
                                     batch_size=8,
                                     shuffle=True,
                                     drop_last=True,
                                     collate_fn=collate_fn)
data = next(iter(loader))

for k, v in data.items():
    print(k, v.shape)

len(loader)

input_ids torch.Size([8, 180])
attention_mask torch.Size([8, 180])
label torch.Size([8])

1200

#定义模型
class Model(torch.nn.Module):
    """Two-class sentence classifier: pretrained encoder + linear head.

    The pretrained 'google-bert/bert-base-chinese' encoder is used purely as
    a feature extractor (its forward pass runs under `torch.no_grad()`), so
    only the linear head `fc` receives gradients.
    """

    def __init__(self):
        super().__init__()
        # Load the pretrained encoder.
        from transformers import AutoModel
        self.pretrained = AutoModel.from_pretrained(
            'google-bert/bert-base-chinese')
        # Maps the 768-dimensional token feature to 2 class scores.
        self.fc = torch.nn.Linear(in_features=768, out_features=2)

    def forward(self, input_ids, attention_mask, label=None):
        """Classify a batch of encoded sentences.

        Args:
            input_ids: token ids, passed straight to the encoder.
            attention_mask: padding mask, passed straight to the encoder.
            label: optional class indices; when given, a loss is computed.

        Returns:
            A `(loss, out)` tuple. `loss` is the cross-entropy loss, or None
            when `label` is None. `out` holds the per-class probabilities
            (softmax over dim 1).
        """
        # Extract features with the pretrained encoder; no gradients flow
        # into it.
        with torch.no_grad():
            last_hidden_state = self.pretrained(
                input_ids=input_ids,
                attention_mask=attention_mask).last_hidden_state
        # Only the feature at position 0 is used for classification.
        first_token = last_hidden_state[:, 0]
        logits = self.fc(first_token)
        loss = None
        if label is not None:
            # cross_entropy applies log-softmax itself, so it must receive the
            # raw logits. Feeding it probabilities applies softmax twice,
            # which bounds the loss and shrinks the gradients.
            loss = torch.nn.functional.cross_entropy(logits, label)
        return loss, logits.softmax(dim=1)

model = Model()
model(**data)

Some weights of the model checkpoint at google-bert/bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

(tensor(0.6389, grad_fn=<NllLossBackward0>),
 tensor([[0.5524, 0.4476],
         [0.6696, 0.3304],
         [0.6045, 0.3955],
         [0.5380, 0.4620],
         [0.6753, 0.3247],
         [0.6832, 0.3168],
         [0.6316, 0.3684],
         [0.6198, 0.3802]], grad_fn=<SoftmaxBackward0>))

#执行训练
def train():
    """Train the global `model` on batches from the global `loader`.

    Uses Adam with lr=1e-4, logs step / total steps / loss / batch accuracy
    every 10 steps, and stops after the step with index 300.
    """
    log_every, last_step = 10, 300
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    for step, batch in enumerate(loader):
        loss, scores = model(**batch)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if step % log_every == 0:
            predicted = scores.argmax(dim=1)
            n_correct = (predicted == batch.label).sum().item()
            print(step, len(loader), loss.item(), n_correct / len(batch.label))
        if step == last_step:
            break
train()

0 1200 0.7262997031211853 0.375
10 1200 0.7581859827041626 0.375
20 1200 0.6871466636657715 0.5
30 1200 0.6931240558624268 0.5
40 1200 0.7119981050491333 0.5
50 1200 0.6597883701324463 0.625
60 1200 0.6511260867118835 0.625
70 1200 0.6154073476791382 0.875
80 1200 0.7182807326316833 0.375
90 1200 0.6051954627037048 0.75
100 1200 0.6379399299621582 0.625
110 1200 0.598688006401062 1.0
120 1200 0.5821105241775513 0.875
130 1200 0.6529800295829773 0.625
140 1200 0.5931781530380249 0.75
150 1200 0.6591622233390808 0.625
160 1200 0.6015957593917847 0.875
170 1200 0.5175269842147827 1.0
180 1200 0.5152828693389893 0.875
190 1200 0.5312021970748901 1.0
200 1200 0.5849854350090027 0.875
210 1200 0.4888814091682434 1.0
220 1200 0.6180499792098999 0.75
230 1200 0.4803149998188019 1.0
240 1200 0.6068854331970215 0.75
250 1200 0.5401108264923096 0.875
260 1200 0.6038746237754822 0.75
270 1200 0.5762675404548645 0.75
280 1200 0.5491940975189209 0.875
290 1200 0.5917487144470215 0.75
300 1200 0.5924007296562195 0.75

#执行测试
def test():
    """Measure accuracy of the global `model` on a sample of the test split.

    Builds a shuffled loader over `dataset['test']` (batch size 8, incomplete
    last batch dropped), evaluates the first six batches without gradients,
    prints the running accuracy after each batch and returns the final one.
    """
    loader_test = torch.utils.data.DataLoader(dataset['test'],
                                              batch_size=8,
                                              shuffle=True,
                                              drop_last=True,
                                              collate_fn=collate_fn)
    n_batches = len(loader_test)
    last_batch_index = 5
    correct, total = 0, 0
    for batch_index, batch in enumerate(loader_test):
        with torch.no_grad():
            _, scores = model(**batch)
        predicted = scores.argmax(dim=1)
        correct += (predicted == batch.label).sum().item()
        total += len(batch.label)
        print(batch_index, n_batches, correct / total)
        if batch_index == last_batch_index:
            break
    return correct / total

test()

0 150 1.0
1 150 0.9375
2 150 0.875
3 150 0.875
4 150 0.85
5 150 0.8333333333333334

0.8333333333333334

import torch
from transformers import AutoTokenizer
#加载tokenizer
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')
tokenizer

PreTrainedTokenizerFast(name_or_path='google-bert/bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

from datasets import load_dataset
#加载数据集
dataset = load_dataset(path='lansinuote/ChnSentiCorp')

#编码
f = lambda x: tokenizer(
    x['text'], truncation=True, max_length=30, return_token_type_ids=False)
dataset = dataset.map(f, remove_columns=['text', 'label'])
#过滤句子长度
f = lambda x: len(x['input_ids']) >= 30
dataset = dataset.filter(f)

#重置label字段
def f(data):
    """Turn one encoded row into a masked-token prediction example.

    The token id at index 15 becomes the row's 'label', and that position in
    'input_ids' is overwritten with the tokenizer's mask token id. The row is
    modified in place and returned.
    """
    token_ids = data['input_ids']
    # Remember the original token at index 15 as the prediction target ...
    data['label'] = token_ids[15]
    # ... then hide it behind the mask token.
    token_ids[15] = tokenizer.mask_token_id
    return data

dataset = dataset.map(f)
#设置数据类型
dataset.set_format('pt')
dataset, dataset['train'][0]

Map: 100%|██████████| 9600/9600 [00:02<00:00, 4680.91 examples/s]
Map: 100%|██████████| 1200/1200 [00:00<00:00, 4670.47 examples/s]
Map: 100%|██████████| 1200/1200 [00:00<00:00, 4724.80 examples/s]
Filter: 100%|██████████| 9600/9600 [00:00<00:00, 78205.43 examples/s]
Filter: 100%|██████████| 1200/1200 [00:00<00:00, 72664.29 examples/s]
Filter: 100%|██████████| 1200/1200 [00:00<00:00, 70590.38 examples/s]
Map: 100%|██████████| 9286/9286 [00:00<00:00, 19875.17 examples/s]
Map: 100%|██████████| 1158/1158 [00:00<00:00, 19889.13 examples/s]
Map: 100%|██████████| 1157/1157 [00:00<00:00, 19947.26 examples/s]

(DatasetDict({
     train: Dataset({
         features: ['input_ids', 'attention_mask', 'label'],
         num_rows: 9286
     })
     validation: Dataset({
         features: ['input_ids', 'attention_mask', 'label'],
         num_rows: 1158
     })
     test: Dataset({
         features: ['input_ids', 'attention_mask', 'label'],
         num_rows: 1157
     })
 }),
 {'input_ids': tensor([ 101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221,
          3175,  912, 8024,  103, 4510, 1220, 2820, 3461, 4684, 2970, 1168, 6809,
          3862, 6804, 8024, 1453, 1741,  102]),
  'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1]),
  'label': tensor(3300)})

loader = torch.utils.data.DataLoader(dataset['train'],
                                     batch_size=8,
                                     shuffle=True,
                                     drop_last=True)
data = next(iter(loader))
for k, v in data.items():
    print(k, v.shape)

len(loader)

input_ids torch.Size([8, 30])
attention_mask torch.Size([8, 30])
label torch.Size([8])

1160

#查看数据样例
for q, a in zip(data['input_ids'], data['label']):
    print(tokenizer.decode(q))
    print(tokenizer.decode(a))
    print('==============')

[CLS] 初 为 父 母 ， 我 们 对 育 儿 一 无 所 知 [MASK] 在 父 辈 那 里 获 得 的 是 些 传 统 的 [SEP]
，
==============
[CLS] 上 周 去 住 了 两 晚, 本 来 想 住 鳶 飞 [MASK] 订 的 时 候 没 房 了, 携 程 给 推 荐 [SEP]
,
==============
[CLS] 等 了 好 几 天 终 于 拿 到 了 ， 配 置 和 [MASK] 牌 来 说 ， 都 令 人 满 意 。 商 务 个 [SEP]
品
==============
[CLS] 书 本 是 做 为 送 给 宝 贝 儿 子 的 礼 物 [MASK], 但 订 了 六 天 还 看 到, 心 情 都 [SEP]
的
==============
[CLS] 特 别 轻 薄 便 携 ， 磨 砂 手 感 ， 非 常 [MASK] 服 的 全 尺 寸 键 盘 ， 功 能 和 接 口 [SEP]
舒
==============
[CLS] 价 格 不 高, 比 较 实 惠, 服 务 也 不 [MASK], 离 闹 市 区 不 远. 交 通 也 比 较 [SEP]
错
==============
[CLS] 做 工 不 错 ， 主 要 事 价 钱 实 在 ， [UNK] [MASK] 看 了 网 上 的 笔 记 本 显 卡 排 行 ， [SEP]
，
==============
[CLS] 商 品 本 身 就 是 有 点 划 花 ， 但 是 可 [MASK] 忍 受 ， 但 是 京 东 的 配 送 实 在 是 [SEP]
以
==============

#定义模型
class Model(torch.nn.Module):
    """Masked-token predictor: pretrained encoder + linear vocabulary head.

    The pretrained 'google-bert/bert-base-chinese' encoder is used purely as
    a feature extractor (its forward pass runs under `torch.no_grad()`), so
    only the linear head `fc` receives gradients. The head scores every
    entry of the tokenizer vocabulary for the token at position 15.
    """

    def __init__(self):
        super().__init__()
        # Load the pretrained encoder.
        from transformers import AutoModel
        self.pretrained = AutoModel.from_pretrained(
            'google-bert/bert-base-chinese')

        # One output score per vocabulary entry of the global tokenizer.
        self.fc = torch.nn.Linear(in_features=768,
                                  out_features=tokenizer.vocab_size)

    def forward(self, input_ids, attention_mask, label=None):
        """Predict the token at position 15 for a batch of sequences.

        Args:
            input_ids: token ids, passed straight to the encoder.
            attention_mask: padding mask, passed straight to the encoder.
            label: optional target token ids; when given, a loss is computed.

        Returns:
            A `(loss, out)` tuple. `loss` is the cross-entropy loss, or None
            when `label` is None. `out` holds the probabilities over the
            vocabulary (softmax over dim 1).
        """
        # Extract features with the pretrained encoder; no gradients flow
        # into it.
        with torch.no_grad():
            last_hidden_state = self.pretrained(
                input_ids=input_ids,
                attention_mask=attention_mask).last_hidden_state
        # Take the feature vector at position 15.
        target_feature = last_hidden_state[:, 15]
        logits = self.fc(target_feature)
        loss = None
        if label is not None:
            # cross_entropy applies log-softmax itself, so it must receive the
            # raw logits. Feeding it probabilities applies softmax twice; with
            # a large vocabulary that pins the loss near log(vocab_size) and
            # leaves almost no gradient.
            loss = torch.nn.functional.cross_entropy(logits, label)
        return loss, logits.softmax(dim=1)

model = Model()
model(**data)

Some weights of the model checkpoint at google-bert/bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

(tensor(9.9583, grad_fn=<NllLossBackward0>),
 tensor([[5.0529e-05, 3.1674e-05, 2.6319e-05,  ..., 2.6106e-05, 2.4358e-05,
          3.1566e-05],
         [5.0705e-05, 3.8273e-05, 3.1873e-05,  ..., 2.1011e-05, 1.7329e-05,
          3.7357e-05],
         [2.7761e-05, 2.2192e-05, 2.5703e-05,  ..., 3.1622e-05, 2.1487e-05,
          3.1346e-05],
         ...,
         [3.3094e-05, 3.0744e-05, 4.1614e-05,  ..., 4.1591e-05, 3.7942e-05,
          3.1904e-05],
         [5.1043e-05, 2.5199e-05, 3.2904e-05,  ..., 2.5904e-05, 2.2800e-05,
          4.4192e-05],
         [2.5545e-05, 3.9304e-05, 3.4576e-05,  ..., 3.1862e-05, 3.3781e-05,
          3.6098e-05]], grad_fn=<SoftmaxBackward0>))

#执行训练
def train():
    """Train the global `model` for 5 epochs over the global `loader`.

    Uses Adam with lr=1e-4 and logs epoch / step / total steps / loss /
    batch accuracy every 200 steps.
    """
    n_epochs, log_every = 5, 200
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    for epoch in range(n_epochs):
        for step, batch in enumerate(loader):
            loss, scores = model(**batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            if step % log_every != 0:
                continue
            targets = batch['label']
            predicted = scores.argmax(dim=1)
            acc = (predicted == targets).sum().item() / len(targets)
            print(epoch, step, len(loader), loss.item(), acc)
train()

0 0 1160 9.958352088928223 0.0
0 200 1160 9.958297729492188 0.0
0 400 1160 9.9579439163208 0.0
0 600 1160 9.839292526245117 0.125
0 800 1160 9.835004806518555 0.125
0 1000 1160 9.736844062805176 0.25
1 0 1160 9.698297500610352 0.375
1 200 1160 9.482852935791016 0.5
1 400 1160 9.632692337036133 0.375
1 600 1160 9.69746208190918 0.375
1 800 1160 9.713400840759277 0.25
1 1000 1160 9.5994291305542 0.5
2 0 1160 9.819761276245117 0.25
2 200 1160 9.2111234664917 0.75
2 400 1160 9.57249927520752 0.375
2 600 1160 9.501824378967285 0.5
2 800 1160 9.71265983581543 0.25
2 1000 1160 9.492500305175781 0.5
3 0 1160 9.67277717590332 0.375
3 200 1160 9.585151672363281 0.375
3 400 1160 9.637388229370117 0.375
3 600 1160 9.629745483398438 0.375
3 800 1160 9.472501754760742 0.5
3 1000 1160 9.59573745727539 0.375
4 0 1160 9.75395393371582 0.25
4 200 1160 9.34372615814209 0.625
4 400 1160 9.506635665893555 0.5
4 600 1160 9.679558753967285 0.25
4 800 1160 9.76767349243164 0.25
4 1000 1160 9.751357078552246 0.25

#执行测试
def test():
    """Score ``model`` on at most six shuffled batches of the test split.

    Prints the running accuracy after each batch and returns the final
    accuracy over all samples that were evaluated.
    """
    loader_test = torch.utils.data.DataLoader(
        dataset['test'],
        batch_size=8,
        shuffle=True,
        drop_last=True,
    )
    n_hit, n_seen = 0, 0
    for step, batch in enumerate(loader_test):
        # No gradients are needed for evaluation.
        with torch.no_grad():
            _, out = model(**batch)
        predicted = out.argmax(dim=1)
        targets = batch['label']
        n_hit += (predicted == targets).sum().item()
        n_seen += len(targets)
        print(step, len(loader_test), n_hit / n_seen)
        # Stop after the sixth batch (index 5).
        if step == 5:
            break
    return n_hit / n_seen

test()

0 144 0.375
1 144 0.4375
2 144 0.4166666666666667
3 144 0.4375
4 144 0.45
5 144 0.4375

0.4375

import torch
import random

from transformers import AutoTokenizer
#Load the tokenizer that matches the bert-base-chinese checkpoint
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')
tokenizer

PreTrainedTokenizerFast(name_or_path='google-bert/bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

from datasets import load_dataset

#Load the dataset
dataset = load_dataset(path='lansinuote/ChnSentiCorp')
#Keep only texts with at least 40 characters, so that the [0:20] and [20:40]
#slices taken in collate_fn are both 20 characters long
f = lambda x: len(x['text']) >= 40
dataset = dataset.filter(f)
#Drop the original 'label' column; collate_fn builds its own labels
dataset = dataset.remove_columns(['label'])
#Cell output: the splits and the first training sample
dataset, dataset['train'][0]

Filter: 100%|██████████| 9600/9600 [00:00<00:00, 516811.73 examples/s]
Filter: 100%|██████████| 1200/1200 [00:00<00:00, 328685.74 examples/s]
Filter: 100%|██████████| 1200/1200 [00:00<00:00, 336891.89 examples/s]

(DatasetDict({
     train: Dataset({
         features: ['text'],
         num_rows: 8130
     })
     validation: Dataset({
         features: ['text'],
         num_rows: 1032
     })
     test: Dataset({
         features: ['text'],
         num_rows: 1011
     })
 }),
 {'text': '选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般'})

#定义数据集遍历工具
def collate_fn(data):
    """Turn a list of samples into an encoded batch of sentence pairs.

    The first 20 characters of every text form the first segment. The second
    segment is characters 20-40 of a text chosen by a random permutation of
    the batch. The label is 1 when both segments come from the same text
    (i.e. they are consecutive) and 0 otherwise.
    """
    texts = [sample['text'] for sample in data]
    # Positions of the texts providing the first and the second segment.
    first_idx = list(range(len(data)))
    second_idx = list(first_idx)
    random.shuffle(second_idx)
    # A pair is "connected" only where the permutation kept the position.
    label = [a == b for a, b in zip(first_idx, second_idx)]
    heads = [texts[k][0:20] for k in first_idx]
    tails = [texts[k][20:40] for k in second_idx]
    # Encode as sentence pairs, padded to the longest pair in the batch.
    batch = tokenizer(heads,
                      tails,
                      padding=True,
                      truncation=True,
                      max_length=50,
                      return_tensors='pt')
    batch['label'] = torch.LongTensor(label)
    return batch

#Training loader: shuffled batches of 4 samples, incomplete last batch dropped,
#each batch encoded as sentence pairs by collate_fn
loader = torch.utils.data.DataLoader(dataset['train'],
                                     batch_size=4,
                                     shuffle=True,
                                     drop_last=True,
                                     collate_fn=collate_fn)

#Fetch one batch and print the shape of every tensor in it
data = next(iter(loader))
for k, v in data.items():
    print(k, v.shape)

#Cell output: number of batches per epoch
len(loader)

input_ids torch.Size([4, 43])
token_type_ids torch.Size([4, 43])
attention_mask torch.Size([4, 43])
label torch.Size([4])

2032

#查看数据样例
#Decode every sample of the batch back to text and print it with its label
for input_ids, label in zip(data['input_ids'], data['label']):
    print(tokenizer.decode(input_ids))
    print(label)
    print('================')

[CLS] 系 统 实 在 太 难 装 了 ， 我 是 先 用 diskgen [SEP] 子 有 点 仿 sony 的 ， 摄 像 头 和 麦 克 风 都 有 ， [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
tensor(0)
================
[CLS] 内 存 用 的 不 错 ， 居 然 是 [UNK] ddr3 的 ， 样 [SEP] 太 好 - - 温 度 不 好 调 节 以 及 噪 音 不 小 ， 淋 浴 水 [SEP] [PAD] [PAD] [PAD] [PAD]
tensor(0)
================
[CLS] 房 间 还 可 以 （ 我 住 的 是 二 十 一 层 ） ， 但 空 调 不 [SEP] 服 只 会 推 托 ， 只 会 要 求 用 户 再 下 订 单 。 如 此 服 [SEP]
tensor(0)
================
[CLS] 当 当 网 名 不 符 实 ， 订 货 多 日 不 见 送 货 ， 询 问 客 [SEP] 、 winpe 分 区 ， 然 后 用 逐 步 安 装 的 [UNK] 盘 [SEP] [PAD] [PAD] [PAD] [PAD]
tensor(0)
================

#定义模型
class Model(torch.nn.Module):
    """Sentence-pair classifier: pretrained BERT encoder plus a linear head.

    The encoder is always run under ``torch.no_grad()``, so only the linear
    head ``fc`` receives gradients during training.
    """

    def __init__(self):
        super().__init__()
        # Imported inside __init__, so transformers is only needed when a
        # Model is actually instantiated.
        from transformers import AutoModel
        self.pretrained = AutoModel.from_pretrained(
            'google-bert/bert-base-chinese')
        # Two-class head on top of the 768-dimensional encoder features.
        self.fc = torch.nn.Linear(in_features=768, out_features=2)

    def forward(self, input_ids, attention_mask, token_type_ids, label=None):
        """Classify each encoded sentence pair.

        Args:
            input_ids, attention_mask, token_type_ids: encoder inputs, passed
                through unchanged to the pretrained model.
            label: optional class indices; when given, the loss is computed.

        Returns:
            A tuple ``(loss, out)``. ``loss`` is the cross-entropy loss, or
            ``None`` when ``label`` is ``None``. ``out`` holds the softmax
            probabilities over the two classes, one row per sample.
        """
        # Extract features with the pretrained encoder, without gradients.
        with torch.no_grad():
            last_hidden_state = self.pretrained(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids).last_hidden_state
        # Only the feature vector at position 0 is used for classification.
        first_token = last_hidden_state[:, 0]
        logits = self.fc(first_token)
        loss = None
        if label is not None:
            # cross_entropy applies log_softmax itself, so it must be given
            # the raw logits. Passing softmax probabilities (as was done
            # before) flattens the loss and its gradients.
            loss = torch.nn.functional.cross_entropy(logits, label)
        # Callers still receive probabilities, exactly as before.
        return loss, logits.softmax(dim=1)

#Instantiate the model and run it once on the sample batch as a smoke test
model = Model()
model(**data)

Some weights of the model checkpoint at google-bert/bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

(tensor(0.7496, grad_fn=<NllLossBackward0>),
 tensor([[0.4863, 0.5137],
         [0.4105, 0.5895],
         [0.3739, 0.6261],
         [0.5156, 0.4844]], grad_fn=<SoftmaxBackward0>))

# 手动实现了模型训练的基本流程：前向传播、计算损失、反向传播、参数更新和指标打印。
def train():
    """Train ``model`` with Adam on the first 301 batches of ``loader``.

    Every 10th batch prints: batch index, number of batches, the batch loss
    and the accuracy on that batch.
    """
    adam = torch.optim.Adam(model.parameters(), lr=1e-4)
    for step, batch in enumerate(loader):
        loss, out = model(**batch)
        loss.backward()
        adam.step()
        adam.zero_grad()
        if step % 10 == 0:
            predicted = out.argmax(dim=1)
            hits = (predicted == batch['label']).sum().item()
            print(step, len(loader), loss.item(), hits / len(batch['label']))
        # Stop early: batch index 300 is the last one used.
        if step == 300:
            break
train()

0 2032 0.6228665113449097 1.0
10 2032 0.6385748386383057 0.75
20 2032 0.5630174875259399 1.0
30 2032 0.6211380362510681 0.75
40 2032 0.6286014318466187 0.75
50 2032 0.604794979095459 1.0
60 2032 0.5244653820991516 1.0
70 2032 0.6693727374076843 0.5
80 2032 0.5640286207199097 0.5
90 2032 0.564754843711853 0.75
100 2032 0.4135267436504364 1.0
110 2032 0.5619957447052002 0.75
120 2032 0.38237079977989197 1.0
130 2032 0.4575287699699402 1.0
140 2032 0.5330777168273926 0.75
150 2032 0.5029939413070679 0.75
160 2032 0.41439491510391235 1.0
170 2032 0.4616542160511017 0.75
180 2032 0.3348842263221741 1.0
190 2032 0.4226081967353821 1.0
200 2032 0.39427024126052856 1.0
210 2032 0.4740626811981201 1.0
220 2032 0.5943247675895691 0.75
230 2032 0.5860543847084045 0.5
240 2032 0.3445925712585449 1.0
250 2032 0.38528740406036377 1.0
260 2032 0.5312015414237976 0.75
270 2032 0.4994317889213562 0.75
280 2032 0.5469095706939697 0.75
290 2032 0.3617188334465027 1.0
300 2032 0.771730899810791 0.5

#执行测试
def test():
    """Score ``model`` on at most six shuffled sentence-pair test batches.

    Prints the running accuracy after each batch and returns the final
    accuracy over all samples that were evaluated.
    """
    loader_test = torch.utils.data.DataLoader(
        dataset['test'],
        batch_size=4,
        shuffle=True,
        drop_last=True,
        collate_fn=collate_fn,
    )
    n_hit, n_seen = 0, 0
    for step, batch in enumerate(loader_test):
        # No gradients are needed for evaluation.
        with torch.no_grad():
            _, out = model(**batch)
        predicted = out.argmax(dim=1)
        targets = batch['label']
        n_hit += (predicted == targets).sum().item()
        n_seen += len(targets)
        print(step, len(loader_test), n_hit / n_seen)
        # Stop after the sixth batch (index 5).
        if step == 5:
            break
    return n_hit / n_seen

test()

0 252 0.75
1 252 0.75
2 252 0.75
3 252 0.75
4 252 0.8
5 252 0.8333333333333334

0.8333333333333334

# Hugging Face 的 Trainer 是一个高级训练工具，简化了模型训练、评估和保存的流程。
# 它支持分布式训练、自动评估、模型保存、日志记录等功能，适用于大多数 NLP/CV 任务。
# 只需定义模型、数据集和训练参数，即可快速启动训练，无需手动编写训练循环。
# Trainer 支持自定义数据处理、评价指标、回调等，适合原型开发和工业应用。

import torch
from transformers import AutoTokenizer
#Load the tokenizer that matches the bert-base-chinese checkpoint
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')
tokenizer

PreTrainedTokenizerFast(name_or_path='google-bert/bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

from datasets import load_dataset
#Load the dataset
dataset = load_dataset(path='lansinuote/ChnSentiCorp')
#Tokenize every text (truncated to at most 500 tokens) and drop the raw 'text'
#column; no padding here, batches are padded later by the data collator
f = lambda x: tokenizer(x['text'], truncation=True, max_length=500)
dataset = dataset.map(f, remove_columns=['text'])
#Return the columns as PyTorch tensors
dataset.set_format('pt')
#Cell output: the splits and the first training sample
dataset, dataset['train'][0]

Map: 100%|██████████| 9600/9600 [00:02<00:00, 4331.78 examples/s]
Map: 100%|██████████| 1200/1200 [00:00<00:00, 4513.32 examples/s]
Map: 100%|██████████| 1200/1200 [00:00<00:00, 4540.61 examples/s]

(DatasetDict({
     train: Dataset({
         features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
         num_rows: 9600
     })
     validation: Dataset({
         features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
         num_rows: 1200
     })
     test: Dataset({
         features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
         num_rows: 1200
     })
 }),
 {'label': tensor(1),
  'input_ids': tensor([ 101, 6848, 2885, 4403, 3736, 5709, 1736, 4638, 1333, 1728, 2218, 3221,
          3175,  912, 8024, 3300, 4510, 1220, 2820, 3461, 4684, 2970, 1168, 6809,
          3862, 6804, 8024, 1453, 1741, 7623, 7667,  510, 7608, 2443,  510, 1555,
          1767,  510, 6631, 2356,  510, 3033,  855,  671, 2418,  936, 1059,  511,
          6983, 2421, 6163,  934,  671, 5663, 8024,  852, 6820, 5050, 3146, 3815,
           511, 3807, 3737, 1762, 1920, 1828, 4638, 2238, 7553, 8024, 1728, 3634,
          2523, 2207, 8024,  679, 6814, 1957, 1036,  948, 3221, 1599, 3614,  511,
          1259, 4638, 3193, 7623, 3221, 6205, 2466, 4638, 8024, 6820, 5050,  705,
          2168,  511, 3302, 1218, 1408, 8024,  671, 5663,  102]),
  'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0]),
  'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1])})

#Define the model
from transformers import BertConfig, BertForSequenceClassification
#Load bert-base-chinese with a sequence-classification head for 2 labels
model = BertForSequenceClassification.from_pretrained(
    'google-bert/bert-base-chinese', num_labels=2)
#Cell output: the model configuration
model.config

Some weights of the model checkpoint at google-bert/bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

BertConfig {
  "_name_or_path": "google-bert/bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.14.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
#Configure training. Note: max_steps=300 takes precedence over
#num_train_epochs (the Trainer logs a warning saying so), so training
#stops after 300 optimizer steps of 8 samples each.
args = TrainingArguments(output_dir='output_dir',
                         num_train_epochs=1,
                         max_steps=300,
                         per_device_train_batch_size=8)
#Create the trainer; DataCollatorWithPadding pads each batch
trainer = Trainer(model=model,
                  args=args,
                  train_dataset=dataset['train'],
                  data_collator=DataCollatorWithPadding(tokenizer))
trainer.train()

max_steps is given, it will override any value given in num_train_epochs
***** Running training *****
  Num examples = 9600
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 300


Training completed. Do not forget to share your model on huggingface.co/models =)

TrainOutput(global_step=300, training_loss=0.4373234558105469, metrics={'train_runtime': 22.3157, 'train_samples_per_second': 107.548, 'train_steps_per_second': 13.443, 'total_flos': 303243713522880.0, 'train_loss': 0.4373234558105469, 'epoch': 0.25})

#执行测试
def test():
    """Score the fine-tuned classifier on at most six shuffled test batches.

    The model is moved to the CPU and switched to inference mode first.
    Prints the running accuracy after each batch and returns the final
    accuracy over all samples that were evaluated.
    """
    device = torch.device("cpu")
    model.to(device)
    # Put the model in inference mode. Its config enables dropout
    # (hidden_dropout_prob=0.1) and it may still be in training mode after
    # trainer.train(); with dropout active the predictions would be noisy.
    model.eval()
    loader_test = torch.utils.data.DataLoader(
        dataset['test'],
        batch_size=8,
        shuffle=True,
        drop_last=True,
        collate_fn=DataCollatorWithPadding(tokenizer))
    correct = 0
    total = 0
    for i, data in enumerate(loader_test):
        # No gradients are needed for evaluation.
        with torch.no_grad():
            out = model(**data).logits
        out = out.argmax(dim=1)
        correct += (out == data.labels).sum().item()
        total += len(data.labels)
        print(i, len(loader_test), correct / total)
        # Stop after the sixth batch (index 5).
        if i == 5:
            break
    return correct / total
test()

0 150 1.0
1 150 0.875
2 150 0.875
3 150 0.875
4 150 0.875
5 150 0.8958333333333334

0.8958333333333334

特点	Spark NLP Hub	Hugging Face
生态系统	依托 Spark，适合大数据分布式处理	以 Transformers 为核心，支持多种深度学习框架
主要任务	NLP（分词、命名实体识别、情感分析等）	NLP、CV、语音等多领域任务
模型数量	数百个，专注于生产级 NLP	数千个，涵盖多种模型（BERT、GPT、T5等）
数据集支持	支持部分公开数据集，集成 Spark 数据管道	Hugging Face Datasets，海量公开数据集
部署方式	适合企业级、分布式部署，支持 Spark 集群	支持本地、云端、API、微服务等多种部署方式
易用性	需一定 Spark 基础，API 设计偏工程化	API 简单，社区活跃，文档丰富
预训练模型	主要为生产环境优化，支持多语言	预训练模型丰富，支持多语言和多任务
商业支持	John Snow Labs 提供商业支持	Hugging Face 提供企业服务和社区支持
典型应用场景	大规模文本处理、企业级 NLP 流水线	学术研究、原型开发、工业应用、AI 产品

大数据分析与挖掘

11. Hugging Face

Spark NLP Hub 与 Hugging Face 对比

Hugging Face 核心组成

词编码工具

数据集的基本操作

加载本地数据集

创建模型

分类任务

填空任务

句子关系推断任务

使用Trainer执行训练