## 十、Hugging Face 简介

<div class="alert alert-warning">
<b>注意：</b> 今天的代码，都不要在Jupyter笔记上直接运行，会死机！！请下载左边的脚本，在自己的环境里运行。
</div>

### 10.1、Hugging Face 是什么

- 官网：https://huggingface.co
- 相当于面向 NLP 模型的 GitHub
- 尤其基于 transformer 的开源模型非常全
- 封装了模型、数据集、训练器等，使模型的下载、使用、训练都非常方便

### 10.2、Hugging Face 安装


```python
# Install with pip
pip install transformers          # latest release
pip install transformers==4.30    # a specific version (no spaces around "==")
# Install with conda
conda install -c huggingface transformers  # only versions 4.0 and later are available
```

### 10.3、加载一个预训练的模型


```python

# Run with: python load-and-run.py

import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

# Model id or local path of the checkpoint to load.
# A larger alternative: "baichuan-inc/baichuan-7B"
model_name_or_path = "distilgpt2"

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True)
model = model.to(device)

# Encode the prompt as PyTorch tensors and move them to the model's device.
inputs = tokenizer('I am', return_tensors='pt').to(device)

# Generate at most 10 new tokens after the prompt.
pred = model.generate(**inputs, max_new_tokens=10)

# Decode the first returned sequence, dropping special tokens.
output = tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)

print(output)
```

### 10.4、基于 Hugging Face 训练/微调一个模型

- 定义你的 Tokenizer
- 定义你的模型结构：如果是已知模型结构，可以通过 Config 指定其超参（层数，头数，维度等）
- 定义数据集加载器：分别加载 train、validation、test 数据集
- 训练你的 Tokenizer **（可选）**：如果你完全从 0 开始训练一个模型，这步是必须的
- 定义一个数据处理函数：Tokenizer 一般用在这里，把原始数据处理成满足模型输入的 Tensor 形式
- 定义 TrainingArguments：模型训练的各种超参在这里指定
- 定义一个 Trainer
- 定义 Evaluation Metrics **（可选）**：如果你希望观察除 loss 外的测试指标


```python

# 运行: python fine-tune-mrpc.py

import datasets
from datasets import load_dataset
from datasets import load_metric
from transformers import AutoTokenizer, AutoModel
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
import transformers
from transformers import DataCollatorWithPadding
from sklearn.metrics import f1_score
import torch
import numpy as np
import os
import torch.nn as nn

SEED = 42

# ALBERT is a compressed variant of BERT
MODEL_NAME = "albert-base-v2"
# "glue" is a collection of NLP evaluation tasks
DATASET_NAME = "glue"
# MRPC (Microsoft Research Paraphrase Corpus) is one of its sub-tasks
DATASET_TASK = "mrpc"

# A linear classifier stacked on top of a BERT-style encoder
class MyClassifier(torch.nn.Module):
    """Sequence classifier: an encoder backbone followed by one linear layer.

    The hidden state at sequence position 0 of the backbone's
    ``last_hidden_state`` is fed to the linear layer to produce the logits.

    Args:
        backbone: encoder module. It is called with ``input_ids`` and
            ``attention_mask`` keyword arguments and must return an object
            exposing ``last_hidden_state``.
        num_labels: number of output classes (default 2).
    """

    def __init__(self, backbone, num_labels=2):
        super().__init__()
        self.bert_encoder = backbone
        # Size the head from the backbone's config instead of hard-coding 768,
        # so encoders with a different hidden size work too. 768 stays as the
        # fallback for backbones that expose no ``config.hidden_size``.
        config = getattr(backbone, "config", None)
        hidden_size = getattr(config, "hidden_size", 768)
        self.linear = torch.nn.Linear(hidden_size, num_labels)

    def compute_loss(self, logits, labels):
        """Return the cross-entropy loss between ``logits`` and ``labels``."""
        loss_fct = torch.nn.CrossEntropyLoss()
        return loss_fct(logits, labels)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and the classification head.

        Returns:
            ``(loss, logits)`` when ``labels`` is given, otherwise ``logits``.
        """
        output = self.bert_encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Keep only the hidden state of the first token of every sequence.
        output = output.last_hidden_state[:, 0, :]
        output = self.linear(output)
        if labels is not None:
            loss = self.compute_loss(output, labels)
            return loss, output
        return output

# Load the evaluation metric that belongs to the dataset/task
glue_metric = datasets.load_metric(DATASET_NAME, DATASET_TASK)


def compute_metrics(eval_pred):
    """Score predictions with the task metric loaded above.

    ``eval_pred`` is unpacked into ``(logits, labels)``; the predicted class
    is the arg-max of the logits over the last axis.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    result = glue_metric.compute(predictions=predicted_classes, references=references)
    return result

# Load the dataset
raw_datasets = load_dataset(DATASET_NAME, DATASET_TASK)

# Training split and validation split
raw_train_dataset, raw_valid_dataset = raw_datasets["train"], raw_datasets["validation"]

# Column names of the raw training split
columns = raw_train_dataset.column_names

# Fix the random seed
transformers.set_seed(SEED)

# Build the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Data-processing function: turns raw examples into input_ids, attention_mask, labels
def process_fn(examples):
    """Tokenize the sentence pairs of a batch and attach the model inputs.

    The ``sentence1``/``sentence2`` columns are tokenized together (truncated
    to at most 128 tokens); ``input_ids``, ``attention_mask`` and ``labels``
    (copied from ``label``) are written into ``examples``, which is returned.
    """
    encoded = tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        truncation=True,
        max_length=128,
    )
    for key in ("input_ids", "attention_mask"):
        examples[key] = encoded[key]
    examples["labels"] = examples["label"]
    return examples



tokenized_train_dataset = raw_train_dataset.map(process_fn, batched=True, remove_columns=columns)

tokenized_valid_dataset = raw_valid_dataset.map(process_fn, batched=True, remove_columns=columns)


# Data collator (assembles the batches automatically)
collater = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# Model definition. transformers could build the classifier directly with:
#   AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
# The classification layer is hand-written here instead, to make it easier to
# see what "a classification task on top of a Transformer" means.
backbone = AutoModel.from_pretrained(MODEL_NAME)
model = MyClassifier(backbone)

# Disable Weights & Biases (wandb) reporting. The variable has to be set
# BEFORE TrainingArguments/Trainer are created: the reporting integrations are
# resolved when those objects are built, so setting it afterwards is too late.
os.environ["WANDB_DISABLED"] = "true"

# Training hyper-parameters
training_args = TrainingArguments(
    output_dir="./output",          # where checkpoints are saved
    evaluation_strategy="steps",    # run evaluation every N steps
    overwrite_output_dir=True,
    num_train_epochs=1,             # number of training epochs
    per_device_train_batch_size=8,  # batch size per device
    gradient_accumulation_steps=4,  # steps accumulated per parameter update
    per_device_eval_batch_size=8,   # evaluation batch size
    logging_steps=20,               # log every 20 steps
    eval_steps=20,                  # evaluate every 20 steps (same as the previous implicit default)
    save_steps=20,                  # save a checkpoint every 20 steps
    learning_rate=2e-5,             # learning rate
    warmup_ratio=0.1,               # warm-up (optional)
)

# Trainer
trainer = Trainer(
    model=model,                            # model to train
    args=training_args,                     # training hyper-parameters
    data_collator=collater,                 # data collator
    train_dataset=tokenized_train_dataset,  # training set
    eval_dataset=tokenized_valid_dataset,   # validation set
    compute_metrics=compute_metrics,        # evaluation metrics
)

# Start training
trainer.train()

```

### 10.5、从头训练一个模型（参考）

```python
import multiprocessing
import os
import random

import torch
import torch.distributed
import transformers
from datasets import load_dataset
from transformers import GPT2Config, GPT2LMHeadModel
from transformers import GPT2TokenizerFast, DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer

MODEL_NAME = "gpt2"
SEED = 42
MAX_LENGTH = 1024
MIN_LENGTH = 4
VOCAB_SIZE = 50257

# 定义数据处理函数


def prepare_train_features(examples):
    """Tokenize a batch of raw texts into fixed-length language-model samples.

    Blank texts are dropped. Every remaining text is tokenized and cut into
    consecutive pieces of ``MAX_LENGTH`` tokens. A final piece shorter than
    ``MAX_LENGTH`` is right-padded (attention mask 0 on the padding); a final
    piece of ``MIN_LENGTH`` tokens or fewer is discarded.

    Args:
        examples: batch mapping with a ``"text"`` column. It is modified in
            place: ``"text"`` is removed and ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` are added.

    Returns:
        The same ``examples`` object. The number of produced rows generally
        differs from the number of input texts.
    """
    # Skip blank lines
    texts = [text.strip() for text in examples["text"] if text.strip()]

    # Convert the texts into ids using the trained tokenizer
    tokenized_example = tokenizer(
        texts,
        truncation=True,
        max_length=MAX_LENGTH * 100,
    )

    # A tokenizer may have no pad token (pad_token_id is None); fall back to
    # EOS so that the padding below never inserts None into input_ids.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    del examples["text"]

    # Fields consumed by the model
    all_input_ids = []
    all_attention_masks = []

    for input_ids, attention_mask in zip(
        tokenized_example["input_ids"], tokenized_example["attention_mask"]
    ):
        # Cut long sequences into MAX_LENGTH-sized pieces
        for start in range(0, len(input_ids), MAX_LENGTH):
            chunk_ids = input_ids[start:start + MAX_LENGTH]
            chunk_mask = attention_mask[start:start + MAX_LENGTH]
            # Only the last piece can be this short; it is dropped
            if len(chunk_ids) <= MIN_LENGTH:
                break
            pad_len = MAX_LENGTH - len(chunk_ids)
            all_input_ids.append(chunk_ids + [pad_id] * pad_len)
            all_attention_masks.append(chunk_mask + [0] * pad_len)

    examples["input_ids"] = all_input_ids
    examples["attention_mask"] = all_attention_masks
    # NOTE(review): labels are a copy of input_ids, padding included; whether
    # padded positions are masked out of the loss depends on the data collator.
    examples["labels"] = all_input_ids.copy()

    return examples


# Enable multi-GPU (distributed) training.
# The script is expected to be started by a distributed launcher (e.g.
# torchrun) that exports RANK, WORLD_SIZE and LOCAL_RANK. With
# init_method="env://" the global rank and world size are read from those
# environment variables, so they are not passed explicitly (the original code
# read them from an undefined `args` object and used the local rank as the
# global rank).
local_rank = int(os.environ.get("LOCAL_RANK", 0))
torch.distributed.init_process_group(backend='nccl', init_method="env://")
# Bind this process to its own GPU
torch.cuda.set_device(local_rank)

# Download the openwebtext dataset automatically: tens of GB before
# extraction, roughly 500 GB once expanded to Arrow format
raw_datasets = load_dataset("openwebtext", split="train")
# Hold out only 1% of the data for evaluation, otherwise every eval pass
# takes very long. The fixed seed gives every process (and every run) the
# same split.
raw_datasets = raw_datasets.train_test_split(test_size=0.01, seed=SEED)

raw_train_dataset = raw_datasets["train"]
raw_valid_dataset = raw_datasets["test"]

# Fix the random seed (the original referenced an undefined `args.seed`)
transformers.set_seed(SEED)

# Build the tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_NAME)
# Reuse EOS as the pad token so that tokenizer.pad_token_id is defined for the
# padding done during preprocessing and batching
tokenizer.pad_token = tokenizer.eos_token
# Number of CPU cores (used as the number of data-processing workers)
num_proc = multiprocessing.cpu_count()

if torch.distributed.get_rank() > 0:
    # Ranks other than 0 wait here while rank 0 (the main process) runs the
    # preprocessing below; presumably they then load the resulting Arrow files
    # from the datasets cache instead of recomputing them — TODO confirm
    torch.distributed.barrier()

# Preprocess the training split with prepare_train_features, in batches,
# using num_proc worker processes
tokenized_train_dataset = raw_train_dataset.map(
    prepare_train_features,
    batched=True,
    num_proc=num_proc
)

# Same preprocessing for the held-out split
tokenized_valid_dataset = raw_valid_dataset.map(
    prepare_train_features,
    batched=True,
    num_proc=num_proc
)

if torch.distributed.get_rank() == 0:
    # Rank 0 has finished preprocessing; entering the barrier releases the
    # ranks that have been waiting at the barrier above
    torch.distributed.barrier()

# Data collator (assembles the batches automatically); mlm=False selects
# causal rather than masked language modeling
collater = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="pt")

# GPT-2 model configuration
model_config = GPT2Config(
    vocab_size=VOCAB_SIZE,
    max_position_embeddings=MAX_LENGTH,
    return_dict=True,
)
# Build the model from the config (parameters are randomly initialised here)
model = GPT2LMHeadModel(config=model_config)

# Disable Weights & Biases (wandb) reporting. The variable has to be set
# BEFORE TrainingArguments/Trainer are created: the reporting integrations are
# resolved when those objects are built, so setting it afterwards is too late.
os.environ["WANDB_DISABLED"] = "true"

# Training hyper-parameters
training_args = TrainingArguments(
    output_dir="./my_model",         # where checkpoints are saved
    evaluation_strategy="steps",     # run evaluation every N steps
    overwrite_output_dir=True,
    num_train_epochs=1,              # number of training epochs
    per_device_train_batch_size=8,   # batch size per device
    gradient_accumulation_steps=20,  # steps accumulated per parameter update
    per_device_eval_batch_size=16,   # evaluation batch size
    logging_steps=1000,              # log every 1000 steps
    eval_steps=1000,                 # evaluate every 1000 steps (same as the previous implicit default)
    save_steps=1000,                 # save a checkpoint every 1000 steps
    learning_rate=1e-3,              # learning rate
    warmup_steps=2000,               # warm-up (optional)
    optim="adamw_hf",                # optimizer (the default, per the original comment)
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collater,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_valid_dataset,
)

# Start training
trainer.train()
```

### 10.6、从头训练一个 Tokenizer（参考）


```python
from tqdm import tqdm
from transformers import GPT2TokenizerFast
from datasets import load_dataset
import os

VOCAB_SIZE = 50257
MAX_LENGTH = 1024
SEED = 42
# Model id of the pretrained GPT-2 tokenizer. The id is "gpt2"; "gpt-2" is not
# a valid id and makes from_pretrained() fail. The name is also used as the
# suffix of the directory the new tokenizer is saved to.
MODEL_NAME = "gpt2"

raw_datasets = load_dataset("openwebtext", split="train")
# Hold out 1% of the data; the fixed seed makes the split reproducible
raw_datasets = raw_datasets.train_test_split(test_size=0.01, seed=SEED)
raw_train_dataset = raw_datasets["train"]
raw_valid_dataset = raw_datasets["test"]

# 定义一个批量加载数据的迭代器


def batch_iterator(batch_size=10000):
    """Yield the ``"text"`` column of the training set, one slice at a time.

    Each yielded item covers ``batch_size`` consecutive rows (fewer for the
    last slice); progress is displayed with tqdm.
    """
    total_rows = len(raw_train_dataset)
    for start in tqdm(range(0, total_rows, batch_size)):
        batch = raw_train_dataset[start: start + batch_size]
        yield batch["text"]


# Load the pretrained tokenizer (to reuse the special tokens it defines)
tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_NAME)
# Use the EOS token as the pad token
tokenizer.pad_token = tokenizer.eos_token

# Train a new tokenizer on the text batches, then save it
gpt_tokenizer = tokenizer.train_new_from_iterator(text_iterator=batch_iterator(), vocab_size=VOCAB_SIZE)
gpt_tokenizer.save_pretrained(f"my-tokenizer-{MODEL_NAME}")
```

<div class="alert alert-success">
<b>划重点：</b> 参数微调过程与上述Training过程基本是一致的。
</div>

- 定义微调数据集加载器
- 定义数据处理函数
- 加载预训练模型：AutoModel.from_pretrained(MODEL_NAME_OR_PATH)
- 在预训练模型上增加任务相关输出层 **（如果需要）**
- 加载预训练 Tokenizer：AutoTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
- 定义各种超参
- 定义 Trainer
- 定义 Evaluation Metric
- 开始训练


[继续](../peft/index.ipynb)
