短文本匹配任務:4B模型微調
1. 引言:短文本匹配的挑戰與機遇
短文本匹配是自然語言處理中的核心任務,廣泛應用于搜索引擎、問答系統、推薦系統和智能客服等場景。與長文本相比,短文本面臨著語境信息有限、語義密度高、表達多樣性等獨特挑戰。
近年來,隨著大語言模型的快速發展,4B參數規模的模型在效果和資源消耗之間提供了良好的平衡點。本文將從理論到實踐,全面介紹4B模型在短文本匹配任務上的微調方法,涵蓋核心概念、多種技術方案、完整代碼實現以及兩大主流框架的詳細使用方法。
2. 核心概念與技術基礎
2.1 短文本匹配的任務定義
短文本匹配旨在判斷兩個簡短文本之間的語義相關性,通常表現為:
- 二分類問題(相關/不相關)
- 相似度評分(0-1連續值)
- 排序問題(相對相關性)
2.2 微調的核心價值
預訓練模型雖然具備強大的語言理解能力,但在特定領域的短文本匹配任務中表現往往不佳。微調的核心價值在于:
- 領域適應:讓模型學習特定領域的術語和表達方式
- 任務對齊:將模型的通用能力聚焦到特定任務上
- 性能提升:在有限數據下獲得超越零樣本學習的表現
2.3 參數高效微調技術
對于4B模型,全參數微調成本高昂,參數高效微調成為首選:
LoRA原理:
# LoRA的數學表達
$$h = W_0 x + \Delta W x$$
$$\Delta W = BA \quad \text{(低秩分解)}, \qquad B \in \mathbb{R}^{d \times r},\ A \in \mathbb{R}^{r \times k}$$
其中 $r \ll \min(d, k)$,大幅減少可訓練參數。
QLoRA進階:
- 引入4-bit量化,進一步降低顯存占用
- 保持FP16的精度效果
- 支持在單張消費級GPU上微調大模型
3. 環境準備與數據準備
3.1 完整環境配置
# requirements.txt
torch==2.1.0
torchvision==0.16.0
torchaudio==2.1.0
transformers==4.37.0
datasets==2.14.0
accelerate==0.25.0
peft==0.8.0
bitsandbytes==0.41.0
trl==0.7.0
evaluate==0.4.0
sentencepiece==0.1.99
protobuf==3.20.3
llama-factory==0.7.0
ms-swift==1.8.0
modelscope==1.11.0
pandas==2.0.3
numpy==1.24.3
tqdm==4.66.1
tensorboard==2.14.1
scikit-learn==1.3.0
jieba==0.42.1
3.2 數據準備
import json
import pandas as pd
import random
from datasets import Dataset, DatasetDict
from typing import List, Dict, Any
class TextMatchingDataProcessor:
    """Generate, format, persist and split a synthetic short-text matching corpus.

    The processor holds ten related and ten unrelated seed sentence pairs and
    derives records from them by wrapping the sentences in filler phrases.
    """

    # Filler-phrase patterns applied to (text1, text2). The identity pattern
    # keeps the bare seed pair as one of the equally likely outcomes.
    _POSITIVE_WRAPPERS = (
        ("我覺得{}", "我也認為{}"),
        ("不得不說{}", "確實如此,{}"),
        ("{}", "總而言之,{}"),
        ("眾所周知,{}", "{},這是事實"),
        ("從某種程度上說,{}", "可以這樣理解:{}"),
        ("{}", "{}"),
    )
    _NEGATIVE_WRAPPERS = (
        ("我認為{}", "但是{}"),
        ("一般來說{}", "然而{}"),
        ("{}", "另一方面,{}"),
        ("盡管{}", "可是{}"),
        ("雖然{}", "但{}"),
        ("{}", "{}"),
    )

    def __init__(self):
        # Seed pairs labelled 1 (semantically related).
        self.positive_templates = [
            ("今天天氣很好", "陽光明媚的一天", 1),
            ("我喜歡吃蘋果", "蘋果是我最喜歡的水果", 1),
            ("學習編程很難", "編程學習有挑戰性", 1),
            ("這部電影很棒", "這部影片非常精彩", 1),
            ("他跑步很快", "他的跑步速度很快", 1),
            ("這個餐廳很好吃", "這家餐館味道不錯", 1),
            ("手機電量不足", "電池快沒電了", 1),
            ("工作很忙碌", "事務繁多", 1),
            ("價格很便宜", "性價比高", 1),
            ("交通很擁堵", "路上很堵車", 1),
        ]
        # Seed pairs labelled 0 (unrelated topics).
        self.negative_templates = [
            ("今天天氣很好", "計算機編程很有趣", 0),
            ("我喜歡吃蘋果", "汽車需要加油", 0),
            ("學習編程很難", "天空是藍色的", 0),
            ("這部電影很棒", "數學公式很復雜", 0),
            ("他跑步很快", "這本書很厚", 0),
            ("這個餐廳很好吃", "電腦運行很慢", 0),
            ("手機電量不足", "房價在上漲", 0),
            ("工作很忙碌", "海水是咸的", 0),
            ("價格很便宜", "地球是圓的", 0),
            ("交通很擁堵", "鳥兒會飛翔", 0),
        ]

    @staticmethod
    def _sample_records(templates, wrappers, count):
        """Draw ``count`` records from ``templates``, each wrapped by a random pattern."""
        records = []
        for _ in range(count):
            text1, text2, label = random.choice(templates)
            pattern1, pattern2 = random.choice(wrappers)
            records.append({
                "text1": pattern1.format(text1),
                "text2": pattern2.format(text2),
                "label": label,
            })
        return records

    def generate_synthetic_data(self, num_samples: int = 2000) -> List[Dict[str, Any]]:
        """Return ``num_samples`` synthetic records, positives first, then negatives.

        Each record is a dict with keys ``text1``, ``text2`` and ``label``.
        For an odd ``num_samples`` the extra record is a negative one, so the
        requested size is always honoured.
        """
        positives = num_samples // 2
        negatives = num_samples - positives
        data = self._sample_records(
            self.positive_templates, self._POSITIVE_WRAPPERS, positives
        )
        data.extend(
            self._sample_records(
                self.negative_templates, self._NEGATIVE_WRAPPERS, negatives
            )
        )
        return data

    def create_instruction_dataset(self, data: List[Dict]) -> Dataset:
        """Convert raw pair records into an instruction-style ``Dataset``.

        Every output row carries ``instruction``/``input``/``output`` fields
        plus a free-form ``text`` prompt chosen at random from four templates,
        and keeps the original ``text1``/``text2``/``label`` values.
        """
        formatted_data = []
        for item in data:
            text1 = item["text1"]
            text2 = item["text2"]
            label = item["label"]
            if label == 1:
                answer = "相關"
                explanation = "這兩個文本在語義上高度相關,表達了相同或相似的意思。"
            else:
                answer = "不相關"
                explanation = "這兩個文本在語義上沒有明顯關聯,討論的是不同的話題。"
            # Several prompt phrasings; only the last one includes the explanation.
            templates = [
                f"判斷以下兩個文本是否語義相關:\n文本1:{text1}\n文本2:{text2}\n答案:{answer}",
                f"請分析這兩個文本的相關性:\n'{text1}'\n'{text2}'\n它們是否相關?{answer}",
                f"文本匹配任務:\n輸入1:{text1}\n輸入2:{text2}\n匹配結果:{answer}",
                f"基于以下信息判斷文本相關性:\n第一個文本:{text1}\n第二個文本:{text2}\n推理:{explanation}\n結論:{answer}",
            ]
            formatted_data.append({
                "instruction": "判斷文本語義相關性",
                "input": f"文本1:{text1}\n文本2:{text2}",
                "output": answer,
                "text": random.choice(templates),
                "text1": text1,
                "text2": text2,
                "label": label,
            })
        return Dataset.from_list(formatted_data)

    def save_dataset(self, dataset: Dataset, output_path: str):
        """Write ``dataset`` as JSON Lines to ``output_path`` plus a sibling CSV.

        The CSV path is derived by swapping the file suffix for ``.csv``. If
        that would collide with ``output_path`` itself, ``.csv`` is appended
        instead so the JSON file is never overwritten.
        """
        from pathlib import Path

        dataset.to_json(output_path, orient="records", lines=True)
        json_path = Path(output_path)
        csv_path = json_path.with_suffix(".csv")
        if csv_path == json_path:
            csv_path = json_path.with_name(json_path.name + ".csv")
        dataset.to_pandas().to_csv(csv_path, index=False, encoding="utf-8")

    def load_dataset(self, data_path: str) -> DatasetDict:
        """Load a JSON dataset and split it into train/validation/test.

        20% is held out as test; 10% of the remainder becomes validation.
        Both splits use seed 42.
        """
        dataset = Dataset.from_json(data_path)
        train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
        train_val_split = train_test_split["train"].train_test_split(test_size=0.1, seed=42)
        return DatasetDict({
            "train": train_val_split["train"],
            "validation": train_val_split["test"],
            "test": train_test_split["test"],
        })
# 數據增強類
import jieba
from synonyms import nearby
class DataAugmentor:
    """Word-level text augmentation (synonym replacement, insertion, swap).

    Texts are segmented with ``jieba``; synonyms come from ``synonyms.nearby``.
    """

    def __init__(self):
        # Pool from which augment_dataset picks one method per text.
        self.augmentation_methods = [
            self.synonym_replacement,
            self.random_insertion,
            self.random_swap,
        ]

    def synonym_replacement(self, text: str, replace_ratio: float = 0.3) -> str:
        """Replace roughly ``replace_ratio`` of the words with a nearby word.

        At least one position is attempted. Single-character words are left
        alone, and a failed synonym lookup leaves the word unchanged.
        """
        words = list(jieba.cut(text))
        n_replace = max(1, int(len(words) * replace_ratio))
        indices = random.sample(range(len(words)), min(n_replace, len(words)))
        for idx in indices:
            word = words[idx]
            if len(word) <= 1:
                continue
            try:
                syns = nearby(word)
            except Exception:
                # Best effort: keep the original word, but do not swallow
                # KeyboardInterrupt/SystemExit as a bare ``except`` would.
                continue
            if syns and len(syns[0]) > 0:
                words[idx] = random.choice(syns[0])
        return "".join(words)

    def random_insertion(self, text: str, insert_ratio: float = 0.3) -> str:
        """Insert copies of randomly chosen words of ``text`` at random positions."""
        words = list(jieba.cut(text))
        n_insert = max(1, int(len(words) * insert_ratio))
        for _ in range(n_insert):
            if words:
                random_word = random.choice(words)
                # randint is inclusive, so appending at the end is possible.
                words.insert(random.randint(0, len(words)), random_word)
        return "".join(words)

    def random_swap(self, text: str, swap_ratio: float = 0.3) -> str:
        """Swap randomly chosen pairs of words; texts with < 2 words are unchanged."""
        words = list(jieba.cut(text))
        n_swap = max(1, int(len(words) * swap_ratio))
        for _ in range(n_swap):
            if len(words) > 1:
                idx1, idx2 = random.sample(range(len(words)), 2)
                words[idx1], words[idx2] = words[idx2], words[idx1]
        return "".join(words)

    def augment_dataset(self, dataset: Dataset, augmentation_factor: int = 2) -> Dataset:
        """Return ``dataset`` plus ``augmentation_factor - 1`` variants per row.

        Every original row is kept. For each variant, ``text1`` and ``text2``
        are augmented independently, and the derived ``text`` and ``input``
        fields are rebuilt from the augmented texts so that every view of the
        record stays consistent.
        """
        augmented_data = []
        for item in dataset:
            augmented_data.append(item)
            for _ in range(augmentation_factor - 1):
                aug_text1 = random.choice(self.augmentation_methods)(item["text1"])
                aug_text2 = random.choice(self.augmentation_methods)(item["text2"])
                augmented_item = dict(item)
                augmented_item["text1"] = aug_text1
                augmented_item["text2"] = aug_text2
                augmented_item["text"] = f"判斷以下兩個文本是否語義相關:\n文本1:{aug_text1}\n文本2:{aug_text2}\n答案:{item['output']}"
                if "input" in augmented_item:
                    # The trainers tokenize instruction/input/output, so a
                    # stale ``input`` would make the variant an exact duplicate.
                    augmented_item["input"] = f"文本1:{aug_text1}\n文本2:{aug_text2}"
                augmented_data.append(augmented_item)
        return Dataset.from_list(augmented_data)
# 使用示例
if __name__ == "__main__":
processor = TextMatchingDataProcessor()
# 生成數據
synthetic_data = processor.generate_synthetic_data(2000)
# 創建指令數據集
dataset = processor.create_instruction_dataset(synthetic_data)
# 數據增強
augmentor = DataAugmentor()
augmented_dataset = augmentor.augment_dataset(dataset)
print(f"原始數據大小: {len(dataset)}")
print(f"增強后數據大小: {len(augmented_dataset)}")
# 保存數據
processor.save_dataset(augmented_dataset, "text_matching_data.json")
# 加載和分割數據
dataset_dict = processor.load_dataset("text_matching_data.json")
print(f"訓練集大小: {len(dataset_dict['train'])}")
print(f"驗證集大小: {len(dataset_dict['validation'])}")
print(f"測試集大小: {len(dataset_dict['test'])}")4. 多方案對比與選擇指南
4.1 基礎對比學習方案
適用場景:標注數據充足,追求最佳性能
技術棧:
- 模型:Qwen2-4B
- 微調方法:LoRA
- 損失函數:對比損失(Contrastive Loss)
- 評估指標:Accuracy, F1-score, AUC
4.2 指令微調+思維鏈方案
適用場景:需要模型提供推理過程,可解釋性要求高
技術棧:
- 模型:Llama-3-4B-Instruct
- 微調方法:QLoRA
- 提示工程:思維鏈提示
- 評估指標:人工評估,推理準確性
4.3 多任務聯合學習方案
適用場景:數據來源多樣,需要模型具備多重能力
技術棧:
- 模型:Phi-4
- 微調方法:LoRA + 適配器
- 任務組合:文本匹配 + 文本分類 + 關鍵詞提取
- 評估指標:各任務獨立評估
4.4 檢索增強生成方案
適用場景:需要結合外部知識,處理專業領域文本
技術棧:
- 模型:Gemma-4B
- 微調方法:LoRA
- 檢索系統:FAISS + 向量數據庫
- 評估指標:檢索準確率,匹配精度
5. 使用Hugging Face Transformers微調
5.1 完整的訓練腳本
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling,
BitsAndBytesConfig,
get_linear_schedule_with_warmup
)
from peft import (
LoraConfig,
get_peft_model,
prepare_model_for_kbit_training,
TaskType
)
from datasets import DatasetDict
import numpy as np
from tqdm import tqdm
import os
import json
class TextMatchingTrainer:
    """QLoRA fine-tuning wrapper around a causal LM for text matching.

    Construction immediately loads the 4-bit quantized model and tokenizer.
    Call :meth:`setup_lora` before :meth:`train`.
    """

    def __init__(self, model_name: str = "Qwen/Qwen2.5-4B"):
        # NOTE(review): confirm that the default model id exists on the
        # Hugging Face Hub; it is passed to from_pretrained as is.
        self.model_name = model_name
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.setup_model()

    def setup_model(self):
        """Load the 4-bit (NF4, double-quantized) model and its tokenizer."""
        self.bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=self.bnb_config,
            device_map="auto",
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            trust_remote_code=True,
        )
        # Fall back to EOS when the tokenizer defines no padding token.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = prepare_model_for_kbit_training(self.model)

    def setup_lora(self, lora_r: int = 16, lora_alpha: int = 32, lora_dropout: float = 0.05):
        """Wrap the model with LoRA adapters and print the trainable-parameter ratio."""
        self.lora_config = LoraConfig(
            r=lora_r,
            lora_alpha=lora_alpha,
            target_modules=[
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj",
            ],
            lora_dropout=lora_dropout,
            bias="none",
            task_type=TaskType.CAUSAL_LM,
        )
        self.model = get_peft_model(self.model, self.lora_config)
        trainable_params = 0
        all_param = 0
        for _, param in self.model.named_parameters():
            all_param += param.numel()
            if param.requires_grad:
                trainable_params += param.numel()
        print(f"可訓練參數: {trainable_params} || 總參數: {all_param} || 可訓練比例: {100 * trainable_params / all_param:.2f}%")

    def tokenize_function(self, examples):
        """Render instruction/input/output into a prompt and tokenize it.

        No ``labels`` column is produced. The language-modeling collator used
        in :meth:`create_trainer` derives labels from the padded
        ``input_ids``; an unpadded, variable-length ``labels`` column cannot
        be batched by that collator.
        """
        texts = []
        for instruction, input_text, output in zip(
            examples["instruction"], examples["input"], examples["output"]
        ):
            if input_text.strip():
                text = f"### Instruction:\n{instruction}\n### Input:\n{input_text}\n### Response:\n{output}"
            else:
                text = f"### Instruction:\n{instruction}\n### Response:\n{output}"
            texts.append(text)
        return self.tokenizer(
            texts,
            truncation=True,
            padding=False,
            max_length=512,
            return_tensors=None,
        )

    def setup_training_args(self, output_dir: str = "./output"):
        """Create ``self.training_args`` for step-based evaluation and saving."""
        # Match mixed precision to the bf16 weights/compute dtype where the
        # hardware supports it; fall back to fp16 on older GPUs.
        use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
        self.training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=5,
            per_device_train_batch_size=2,
            per_device_eval_batch_size=2,
            gradient_accumulation_steps=4,
            eval_accumulation_steps=4,
            learning_rate=2e-4,
            warmup_steps=100,
            logging_steps=50,
            eval_steps=200,
            # load_best_model_at_end requires save_steps to be a multiple of
            # eval_steps, otherwise TrainingArguments raises a ValueError.
            save_steps=400,
            save_total_limit=3,
            load_best_model_at_end=True,
            evaluation_strategy="steps",
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            bf16=use_bf16,
            fp16=torch.cuda.is_available() and not use_bf16,
            dataloader_pin_memory=False,
            remove_unused_columns=False,
            report_to=["tensorboard"],
            ddp_find_unused_parameters=False,
        )

    def compute_metrics(self, eval_pred):
        """Return the next-token cross-entropy as ``{"eval_loss": float}``.

        ``eval_pred`` is a ``(logits, labels)`` pair. Logits at position *t*
        are scored against the label at *t + 1*, the causal-LM convention.
        Positions labelled -100 are ignored.
        """
        predictions, labels = eval_pred
        logits = torch.as_tensor(predictions)
        targets = torch.as_tensor(labels)
        shift_logits = logits[..., :-1, :]
        shift_labels = targets[..., 1:]
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(
            shift_logits.reshape(-1, shift_logits.size(-1)).float(),
            shift_labels.reshape(-1).long(),
        )
        return {"eval_loss": loss.item()}

    def create_trainer(self, train_dataset, eval_dataset):
        """Build and return the ``Trainer`` (also stored on ``self.trainer``).

        ``compute_metrics`` is deliberately not wired in: the Trainer already
        reports ``eval_loss`` itself, and a metrics callback would make it
        accumulate full-vocabulary logits for the whole evaluation set.
        """
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )
        self.trainer = Trainer(
            model=self.model,
            args=self.training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            data_collator=data_collator,
            tokenizer=self.tokenizer,
        )
        return self.trainer

    def train(self, dataset_dict: DatasetDict, output_dir: str = "./output"):
        """Tokenize ``dataset_dict``, train, save model/state/metrics, return the Trainer.

        ``dataset_dict`` must provide ``train`` and ``validation`` splits with
        ``instruction``, ``input`` and ``output`` columns.
        """
        tokenized_datasets = dataset_dict.map(
            self.tokenize_function,
            batched=True,
            remove_columns=dataset_dict["train"].column_names,
        )
        self.setup_training_args(output_dir)
        trainer = self.create_trainer(
            tokenized_datasets["train"],
            tokenized_datasets["validation"],
        )
        print("開始訓練...")
        train_result = trainer.train()
        trainer.save_model()
        trainer.save_state()
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        print(f"訓練完成! 模型保存在: {output_dir}")
        return trainer
# 使用示例
if __name__ == "__main__":
# 加載數據
processor = TextMatchingDataProcessor()
dataset_dict = processor.load_dataset("text_matching_data.json")
# 初始化訓練器
trainer = TextMatchingTrainer("Qwen/Qwen2.5-4B")
# 設置LoRA
trainer.setup_lora(lora_r=16, lora_alpha=32)
# 開始訓練
trained_trainer = trainer.train(dataset_dict, "./text_matching_output")5.2 高級訓練技巧
class AdvancedTrainingTechniques:
    """Optional helpers: a padding collate function and two step schedules."""

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def dynamic_batching(self, dataset, max_length=512, batch_size=4):
        """Return a collate function that pads each batch to its longest item.

        ``dataset``, ``max_length`` and ``batch_size`` are accepted for
        interface compatibility but are not used by the collate function.
        """
        def collate_fn(batch):
            # Longest sequence first; padding goes on the right.
            batch = sorted(batch, key=lambda x: len(x["input_ids"]), reverse=True)

            def pad(sequences, padding_value):
                return torch.nn.utils.rnn.pad_sequence(
                    [torch.tensor(seq) for seq in sequences],
                    batch_first=True,
                    padding_value=padding_value,
                )

            return {
                "input_ids": pad(
                    [item["input_ids"] for item in batch],
                    self.tokenizer.pad_token_id,
                ),
                "attention_mask": pad(
                    [item["attention_mask"] for item in batch], 0
                ),
                # Rows without an explicit ``labels`` column reuse their
                # input_ids; -100 padding is ignored by cross-entropy losses.
                "labels": pad(
                    [item.get("labels", item["input_ids"]) for item in batch],
                    -100,
                ),
            }

        return collate_fn

    def gradient_accumulation_scheduler(self, current_step, total_steps):
        """Return 2, 4 or 8 accumulation steps for the first 30%, next 30%, rest."""
        if current_step < total_steps * 0.3:
            return 2
        elif current_step < total_steps * 0.6:
            return 4
        else:
            return 8

    def learning_rate_scheduler(self, current_step, total_steps, initial_lr=2e-4):
        """Linear warm-up over the first 10% of steps, then cosine decay to 0.

        Steps beyond ``total_steps`` stay at the final value instead of
        climbing back up the cosine, and ``total_steps == 0`` no longer
        divides by zero. There are no warm restarts.
        """
        warmup_steps = int(total_steps * 0.1)
        if current_step < warmup_steps:
            return initial_lr * (current_step / warmup_steps)
        decay_steps = max(1, total_steps - warmup_steps)
        progress = min(1.0, max(0.0, (current_step - warmup_steps) / decay_steps))
        return initial_lr * 0.5 * (1 + np.cos(np.pi * progress))
class ContrastiveLossTrainer(TextMatchingTrainer):
    """Trainer variant that scores anchor/positive/negative triples with a cosine loss."""

    def __init__(self, model_name: str = "Qwen/Qwen2.5-4B"):
        super().__init__(model_name)
        self.contrastive_loss = nn.CosineEmbeddingLoss()

    def get_text_embeddings(self, input_ids, attention_mask):
        """Return one embedding per sequence: the masked mean of the last hidden layer.

        Gradients are tracked so the result can be used in a training loss;
        wrap the call in ``torch.no_grad()`` for pure inference. Padding
        positions (``attention_mask == 0``) do not contribute to the mean.
        """
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )
        last_hidden_state = outputs.hidden_states[-1]
        mask = attention_mask.unsqueeze(-1).to(
            device=last_hidden_state.device, dtype=last_hidden_state.dtype
        )
        token_sum = (last_hidden_state * mask).sum(dim=1)
        # clamp avoids 0/0 for a fully masked row.
        token_count = mask.sum(dim=1).clamp(min=1)
        return token_sum / token_count

    def compute_contrastive_loss(self, batch):
        """Return the cosine loss pulling anchor to positive and pushing it from negative.

        ``batch`` must contain ``{anchor,positive,negative}_input_ids`` and the
        matching ``*_attention_mask`` entries.
        """
        anchor_emb = self.get_text_embeddings(
            batch["anchor_input_ids"], batch["anchor_attention_mask"]
        ).float()
        positive_emb = self.get_text_embeddings(
            batch["positive_input_ids"], batch["positive_attention_mask"]
        ).float()
        negative_emb = self.get_text_embeddings(
            batch["negative_input_ids"], batch["negative_attention_mask"]
        ).float()
        # Targets must live on the same device as the embeddings.
        same = torch.ones(anchor_emb.size(0), device=anchor_emb.device)
        pos_loss = self.contrastive_loss(anchor_emb, positive_emb, same)
        neg_loss = self.contrastive_loss(anchor_emb, negative_emb, -same)
        return pos_loss + neg_loss
# 使用高級訓練技巧
def setup_advanced_training():
"""設置高級訓練"""
# 加載基礎訓練器
base_trainer = TextMatchingTrainer("Qwen/Qwen2.5-4B")
base_trainer.setup_lora()
# 設置高級技巧
advanced_tech = AdvancedTrainingTechniques(base_trainer.model, base_trainer.tokenizer)
return base_trainer, advanced_tech6. 使用LLaMA Factory微調
6.1 完整配置和訓練腳本
import os
import yaml
from datetime import datetime
class LLaMAFactoryConfig:
    """Build and serialize a LLaMA Factory style training configuration.

    NOTE(review): the key names below (e.g. ``lora_target_modules``,
    ``bnb_4bit_*``) and the YAML dataset registry should be checked against
    the installed LLaMA Factory version; they are written out verbatim.
    """

    def __init__(self, model_name: str = "Qwen/Qwen2.5-4B"):
        self.model_name = model_name
        self.config = self.create_default_config()

    def create_default_config(self) -> dict:
        """Return the default configuration dict with a timestamped output dir."""
        return {
            "model_name_or_path": self.model_name,
            "dataset": "text_matching_dataset",
            "output_dir": f"./llama_factory_output/{datetime.now().strftime('%Y%m%d_%H%M%S')}",
            # Training
            "num_train_epochs": 5,
            "max_samples": 100000,
            "learning_rate": 2e-4,
            "lr_scheduler_type": "cosine",
            "warmup_ratio": 0.1,
            # Batching
            "per_device_train_batch_size": 4,
            "per_device_eval_batch_size": 4,
            "gradient_accumulation_steps": 4,
            # LoRA
            "finetuning_type": "lora",
            "lora_rank": 16,
            "lora_alpha": 32,
            "lora_dropout": 0.1,
            "lora_target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
            # Quantization
            "quantization_bit": 4,
            "bnb_4bit_compute_dtype": "bfloat16",
            "bnb_4bit_use_double_quant": True,
            "bnb_4bit_quant_type": "nf4",
            # Misc
            "fp16": True,
            "logging_steps": 50,
            # Must be a multiple of eval_steps when load_best_model_at_end
            # is enabled.
            "save_steps": 400,
            "eval_steps": 200,
            "save_total_limit": 3,
            "evaluation_strategy": "steps",
            "load_best_model_at_end": True,
            "metric_for_best_model": "loss",
            "greater_is_better": False,
            # Template
            "template": "qwen",
            "cutoff_len": 512,
            "overwrite_cache": True,
        }

    def update_config(self, **kwargs):
        """Override or add configuration entries in place."""
        self.config.update(kwargs)

    def save_config(self, config_path: str = "llama_factory_config.yaml"):
        """Write the configuration to ``config_path`` as YAML.

        The parent directory is created when the path has one; a bare file
        name is written to the current directory.
        """
        parent_dir = os.path.dirname(config_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(config_path, 'w', encoding='utf-8') as f:
            yaml.dump(self.config, f, allow_unicode=True, default_flow_style=False)
        print(f"配置已保存到: {config_path}")

    def create_dataset_config(self, dataset_path: str, dataset_name: str = "text_matching_dataset"):
        """Write ``dataset_info.yaml`` registering ``dataset_path`` and return its path.

        NOTE(review): LLaMA Factory may expect ``dataset_info.json`` inside its
        dataset directory and a different ``formatting`` value — confirm.
        """
        dataset_config = {
            dataset_name: {
                "file_name": dataset_path,
                "formatting": "instruction",
            }
        }
        dataset_config_path = "dataset_info.yaml"
        with open(dataset_config_path, 'w', encoding='utf-8') as f:
            yaml.dump(dataset_config, f, allow_unicode=True, default_flow_style=False)
        return dataset_config_path
def setup_llama_factory_environment():
    """Create the working directories and report whether LLaMA Factory is installed.

    Returns ``True`` when an importable LLaMA Factory module is found and
    ``False`` otherwise. The package is only located, not imported, so the
    check has no import-time side effects.
    """
    import importlib.util

    os.makedirs("./llama_factory_output", exist_ok=True)
    os.makedirs("./data", exist_ok=True)
    # NOTE(review): the importable module name differs between releases;
    # both spellings are probed — confirm against the installed package.
    candidates = ("llama_factory", "llamafactory")
    if any(importlib.util.find_spec(name) is not None for name in candidates):
        print("LLaMA Factory 已安裝")
        return True
    print("請先安裝 LLaMA Factory: pip install llama-factory")
    return False
def run_llama_factory_training(config_path: str, dataset_config_path: str):
    """Run the training script in a subprocess; return ``True`` on exit code 0.

    Output is captured and printed only after the process finishes.

    NOTE(review): ``src/train_bash.py`` is resolved relative to the current
    working directory, and the ``--config`` / ``--dataset_config`` flags must
    match the installed LLaMA Factory entry point — confirm both.
    """
    import subprocess
    import sys

    cmd = [
        sys.executable, "src/train_bash.py",
        "--config", config_path,
        "--dataset_config", dataset_config_path,
    ]
    print("開始訓練...")
    print("命令:", " ".join(cmd))
    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"訓練失敗: {e}")
        print(f"錯誤輸出: {e.stderr}")
        return False
    print("訓練輸出:", result.stdout)
    if result.stderr:
        print("訓練錯誤:", result.stderr)
    return True
# LLaMA Factory使用示例
if __name__ == "__main__":
# 設置環境
if not setup_llama_factory_environment():
exit(1)
# 創建配置
config = LLaMAFactoryConfig("Qwen/Qwen2.5-4B")
# 更新特定配置
config.update_config(
output_dir="./llama_factory_output/text_matching",
num_train_epochs=5,
learning_rate=2e-4
)
# 保存配置
config.save_config("llama_factory_config.yaml")
# 創建數據集配置
dataset_config_path = config.create_dataset_config("text_matching_data.json")
# 運行訓練
success = run_llama_factory_training("llama_factory_config.yaml", dataset_config_path)
if success:
print("訓練完成!")
else:
print("訓練失敗!")6.2 命令行方式
6.2.1 訓練命令
# save_steps must be a multiple of eval_steps when --load_best_model_at_end is set.
llamafactory-cli train \
    --model_name_or_path Qwen/Qwen2.5-4B \
    --dataset text_matching_dataset \
    --template qwen \
    --finetuning_type lora \
    --lora_target q_proj,k_proj,v_proj,o_proj \
    --lora_rank 16 \
    --lora_alpha 32 \
    --lora_dropout 0.1 \
    --output_dir ./llama_factory_output \
    --overwrite_cache \
    --overwrite_output_dir \
    --cutoff_len 512 \
    --preprocessing_num_workers 16 \
    --num_train_epochs 5 \
    --per_device_train_batch_size 4 \
    --gradient_accumulation_steps 4 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --save_steps 400 \
    --learning_rate 2e-4 \
    --warmup_ratio 0.1 \
    --evaluation_strategy steps \
    --eval_steps 200 \
    --load_best_model_at_end \
    --metric_for_best_model eval_loss \
    --greater_is_better false \
    --fp16 \
    --do_train \
    --do_eval
6.2.2 評估命令
# NOTE(review): confirm that the `eval` subcommand accepts generation/prediction
# flags such as --do_predict in the installed LLaMA Factory version.
llamafactory-cli eval \
    --model_name_or_path Qwen/Qwen2.5-4B \
    --adapter_name_or_path ./llama_factory_output \
    --dataset text_matching_dataset \
    --template qwen \
    --finetuning_type lora \
    --cutoff_len 512 \
    --per_device_eval_batch_size 8 \
    --predict_with_generate \
    --max_new_tokens 128 \
    --temperature 0.1 \
    --top_p 0.75 \
    --do_predict
7. 使用MS-SWIFT微調
7.1 完整訓練腳本
import os
import torch
from modelscope import Model, snapshot_download
from swift import Swift, LoRAConfig, Trainer
from swift import push_to_hub_cmd
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
TrainingArguments,
DataCollatorForLanguageModeling
)
from datasets import DatasetDict
import evaluate
import numpy as np
class MSSWIFTTrainer:
    """LoRA fine-tuning wrapper built on ModelScope downloads and the SWIFT Trainer.

    Construction immediately downloads (if needed) and loads the model and
    tokenizer. Call :meth:`setup_lora` before :meth:`train`.
    """

    def __init__(self, model_name: str = "Qwen/Qwen2.5-4B"):
        # NOTE(review): confirm that the default model id exists on ModelScope.
        self.model_name = model_name
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.setup_model_and_tokenizer()

    def setup_model_and_tokenizer(self):
        """Download the model snapshot and load the tokenizer and bf16 model."""
        model_dir = snapshot_download(self.model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_dir,
            trust_remote_code=True,
        )
        # Fall back to EOS when the tokenizer defines no padding token.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = AutoModelForCausalLM.from_pretrained(
            model_dir,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )

    def setup_lora(self, lora_r: int = 16, lora_alpha: int = 32, lora_dropout: float = 0.05):
        """Attach LoRA adapters via SWIFT and print the trainable-parameter ratio."""
        self.lora_config = LoRAConfig(
            r=lora_r,
            target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
        )
        self.model = Swift.prepare_model(self.model, self.lora_config)
        trainable_params = 0
        all_param = 0
        for _, param in self.model.named_parameters():
            all_param += param.numel()
            if param.requires_grad:
                trainable_params += param.numel()
        print(f"可訓練參數: {trainable_params} || 總參數: {all_param} || 可訓練比例: {100 * trainable_params / all_param:.2f}%")

    def tokenize_function(self, examples):
        """Render rows into a ChatML-style prompt and tokenize them.

        No ``labels`` column is produced. The language-modeling collator
        derives labels from the padded ``input_ids``; an unpadded,
        variable-length ``labels`` column cannot be batched by it.
        """
        texts = []
        for instruction, input_text, output in zip(
            examples["instruction"], examples["input"], examples["output"]
        ):
            text = f"<|im_start|>system\n你是一個文本匹配助手。<|im_end|>\n<|im_start|>user\n{instruction}\n{input_text}<|im_end|>\n<|im_start|>assistant\n{output}<|im_end|>"
            texts.append(text)
        return self.tokenizer(
            texts,
            truncation=True,
            padding=False,
            max_length=512,
            return_tensors=None,
        )

    def setup_training_arguments(self, output_dir: str = "./swift_output"):
        """Create ``self.training_args`` for step-based evaluation and saving."""
        # The model is loaded in bfloat16, so prefer bf16 mixed precision;
        # fall back to fp16 only on GPUs without bf16 support.
        use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
        self.training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=5,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            gradient_accumulation_steps=4,
            learning_rate=2e-4,
            warmup_ratio=0.1,
            logging_steps=50,
            eval_steps=200,
            # load_best_model_at_end requires save_steps to be a multiple of
            # eval_steps.
            save_steps=400,
            save_total_limit=3,
            evaluation_strategy="steps",
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            bf16=use_bf16,
            fp16=torch.cuda.is_available() and not use_bf16,
            remove_unused_columns=False,
            report_to=["tensorboard"],
            dataloader_pin_memory=False,
        )

    @staticmethod
    def _argmax_logits(logits, labels):
        """Reduce logits to token ids so evaluation does not accumulate full-vocab logits."""
        if isinstance(logits, tuple):
            logits = logits[0]
        return logits.argmax(dim=-1)

    def compute_metrics(self, eval_pred):
        """Return next-token accuracy as ``{"accuracy": float}``.

        ``eval_pred`` is ``(predictions, labels)``. Predictions may be raw
        logits or already-argmaxed token ids. The prediction at position *t*
        is compared with the label at *t + 1*, and positions labelled -100
        are ignored. Returns 0.0 when no position is scored.
        """
        predictions, labels = eval_pred
        if isinstance(predictions, tuple):
            predictions = predictions[0]
        predictions = np.asarray(predictions)
        labels = np.asarray(labels)
        if predictions.ndim == labels.ndim + 1:
            predictions = predictions.argmax(axis=-1)
        shifted_predictions = predictions[..., :-1]
        shifted_labels = labels[..., 1:]
        scored = shifted_labels != -100
        total = int(scored.sum())
        if total == 0:
            return {"accuracy": 0.0}
        correct = int(((shifted_predictions == shifted_labels) & scored).sum())
        return {"accuracy": correct / total}

    def create_data_collator(self):
        """Return the causal-LM collator (pads the batch and builds labels)."""
        return DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )

    def train(self, dataset_dict: DatasetDict, output_dir: str = "./swift_output"):
        """Tokenize, train, save model/state/metrics and the adapter; return the Trainer.

        ``dataset_dict`` must provide ``train`` and ``validation`` splits with
        ``instruction``, ``input`` and ``output`` columns.
        """
        tokenized_datasets = dataset_dict.map(
            self.tokenize_function,
            batched=True,
            remove_columns=dataset_dict["train"].column_names,
        )
        self.setup_training_arguments(output_dir)
        data_collator = self.create_data_collator()
        trainer = Trainer(
            model=self.model,
            args=self.training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["validation"],
            tokenizer=self.tokenizer,
            data_collator=data_collator,
            compute_metrics=self.compute_metrics,
            # NOTE(review): assumes the SWIFT Trainer forwards this standard
            # Hugging Face Trainer argument — confirm for the pinned version.
            preprocess_logits_for_metrics=self._argmax_logits,
        )
        print("開始訓練...")
        train_result = trainer.train()
        trainer.save_model()
        trainer.save_state()
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        # Also store the adapter weights in their own sub-directory.
        output_adapter_dir = os.path.join(output_dir, "adapter")
        self.model.save_pretrained(output_adapter_dir)
        print(f"訓練完成! 模型保存在: {output_dir}")
        print(f"適配器保存在: {output_adapter_dir}")
        return trainer
# MS-SWIFT command-line training configuration.
def create_swift_cli_config(output_path: str = "swift_config.json"):
    """Write the default SWIFT CLI configuration as JSON and return it.

    Args:
        output_path: Destination file; defaults to ``swift_config.json`` in
            the current directory.

    Returns:
        The configuration dict that was written.

    NOTE(review): ``model_type`` and the other keys are written verbatim;
    confirm they match the installed ms-swift version.
    """
    import json

    config = {
        "model_type": "qwen2-4b",
        "model_id_or_path": "Qwen/Qwen2.5-4B",
        "dataset": "text-matching-dataset",
        "output_dir": "./swift_cli_output",
        "lora": True,
        "lora_rank": 16,
        "lora_alpha": 32,
        "lora_dropout": 0.05,
        "learning_rate": 2e-4,
        "num_train_epochs": 5,
        "max_length": 512,
        "per_device_train_batch_size": 4,
        "gradient_accumulation_steps": 4,
        "logging_steps": 50,
        "save_steps": 500,
    }
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2, ensure_ascii=False)
    return config
# 使用示例
if __name__ == "__main__":
# 加載數據
processor = TextMatchingDataProcessor()
dataset_dict = processor.load_dataset("text_matching_data.json")
# 初始化MS-SWIFT訓練器
swift_trainer = MSSWIFTTrainer("Qwen/Qwen2.5-4B")
# 設置LoRA
swift_trainer.setup_lora(lora_r=16, lora_alpha=32)
# 開始訓練
trained_trainer = swift_trainer.train(dataset_dict, "./swift_text_matching_output")
# 創建命令行配置(可選)
create_swift_cli_config()7.2 命令行方式
7.2.1 訓練命令
# save_steps must be a multiple of eval_steps when --load_best_model_at_end is set.
CUDA_VISIBLE_DEVICES=0 swift sft \
    --model_id_or_path Qwen/Qwen2.5-4B \
    --dataset text-matching-dataset \
    --output_dir ./swift_output \
    --lora_rank 16 \
    --lora_alpha 32 \
    --lora_dropout 0.05 \
    --lora_target_modules q_proj,k_proj,v_proj,o_proj \
    --learning_rate 2e-4 \
    --num_train_epochs 5 \
    --max_length 512 \
    --per_device_train_batch_size 4 \
    --gradient_accumulation_steps 4 \
    --lr_scheduler_type cosine \
    --warmup_ratio 0.1 \
    --logging_steps 10 \
    --save_steps 400 \
    --eval_steps 200 \
    --save_total_limit 3 \
    --evaluation_strategy steps \
    --load_best_model_at_end \
    --metric_for_best_model eval_loss \
    --greater_is_better false \
    --fp16 \
    --do_train \
    --do_eval
7.2.2 評估命令
# NOTE(review): confirm the adapter flag name (--model_id_or_path_adapter)
# against the installed ms-swift version.
CUDA_VISIBLE_DEVICES=0 swift infer \
    --model_id_or_path Qwen/Qwen2.5-4B \
    --model_id_or_path_adapter ./swift_output \
    --dataset text-matching-dataset \
    --infer_backend pt \
    --max_length 512 \
    --max_new_tokens 128 \
    --temperature 0.1 \
    --top_p 0.75 \
    --per_device_eval_batch_size 8 \
    --do_predict \
    --merge_lora false
7.3 關鍵參數(LLaMA Factory 和 MS-SWIFT共用)
7.3.1 關鍵訓練參數
- `--model_name_or_path` / `--model_id_or_path`: 基礎模型路徑
- `--dataset`: 數據集名稱
- `--lora_rank`: LoRA秩,控制參數數量
- `--learning_rate`: 學習率,LoRA通常用1e-4到5e-4
- `--num_train_epochs`: 訓練輪數
- `--per_device_train_batch_size`: 批次大小
- `--gradient_accumulation_steps`: 梯度累積步數
7.3.2 關鍵評估參數
- `--adapter_name_or_path` / `--model_id_or_path_adapter`: 微調后模型路徑
- `--per_device_eval_batch_size`: 評估批次大小
- `--max_new_tokens`: 生成的最大token數
- `--temperature`: 生成溫度
- `--top_p`: 核采樣參數
7.4 LLaMA Factory 和 MS-SWIFT 對比
7.4.1 使用場景對比
| 場景 | LLaMA Factory | MS-SWIFT |
| --- | --- | --- |
| 單GPU訓練 | 需要顯式指定 | 自動檢測使用 |
| 多GPU訓練 | 需要指定設備ID | 自動使用所有GPU |
| 指定特定GPU | 需要指定 | 同樣需要指定 |
| 分布式訓練 | 需要額外參數 | 自動處理 |
7.4.2 使用方式對比
對于單GPU用戶
# LLaMA Factory - 需要指定
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train ...
# MS-SWIFT - 可選指定
swift sft ... # 自動使用GPU 0
# 或顯式指定
CUDA_VISIBLE_DEVICES=0 swift sft ...
對于多GPU用戶
# 使用前兩個GPU
CUDA_VISIBLE_DEVICES=0,1 llamafactory-cli train ...
CUDA_VISIBLE_DEVICES=0,1 swift sft ...
# 使用所有GPU
llamafactory-cli train ... # 需要額外分布式參數
swift sft ... # 自動使用所有GPU
7.4.3 參數組合示例
完整訓練 + 評估
--do_train --do_eval --evaluation_strategy steps --eval_steps 200
僅訓練(不評估)
--do_train
僅評估(不訓練)
--do_eval
訓練 + 最終評估
--do_train --do_eval --evaluation_strategy no # 訓練時不評估,但最終會評估一次
8. 多任務聯合學習
8.1 多任務學習的優勢
- 知識共享:不同任務之間可以共享底層特征表示
- 正則化效應:多任務學習起到正則化作用,防止過擬合
- 數據效率:充分利用有限的標注數據
- 泛化能力:提高模型在未見數據上的泛化能力
- 統一框架:在一個模型中處理多個相關任務
8.2 完整的多任務聯合學習實現
import torch
import torch.nn as nn
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling,
BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset, DatasetDict
import numpy as np
from typing import Dict, List, Any
import json
class MultiTaskTextMatchingModel(nn.Module):
    """Causal LM backbone (4-bit + LoRA) with three task heads.

    Heads: text matching (2 classes), text classification (``num_classes``)
    and a single-logit keyword head trained with a binary loss.
    """

    def __init__(self, model_name: str = "Qwen/Qwen2.5-4B", num_classes: int = 10):
        super().__init__()
        # NOTE(review): confirm that the default model id exists on the Hub.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        self.base_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
        )
        self.base_model = prepare_model_for_kbit_training(self.base_model)
        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=[
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj",
            ],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )
        self.base_model = get_peft_model(self.base_model, lora_config)
        hidden_size = self.base_model.config.hidden_size
        # Task heads.
        self.matching_head = nn.Linear(hidden_size, 2)  # text matching: binary
        self.classification_head = nn.Linear(hidden_size, num_classes)  # text classification
        self.keyword_head = nn.Linear(hidden_size, 1)  # keyword extraction: regression or binary
        # Relative weight of each task in the summed loss.
        self.task_weights = {
            "matching": 1.0,
            "classification": 0.7,
            "keyword": 0.5,
        }
        self.matching_loss = nn.CrossEntropyLoss()
        self.classification_loss = nn.CrossEntropyLoss()
        self.keyword_loss = nn.BCEWithLogitsLoss()

    @staticmethod
    def _last_token_pool(hidden_state, attention_mask):
        """Return the hidden state of the last non-padded token of each sequence.

        With causal attention only the final token has attended to the whole
        input; the first token has seen nothing but itself. Works for left-
        and right-padded batches.
        """
        if attention_mask is None:
            return hidden_state[:, -1, :]
        mask = attention_mask.to(hidden_state.device).long()
        seq_len = mask.size(1)
        # Index of the last 1 in each row (seq_len - 1 for an all-zero row).
        last_index = seq_len - 1 - mask.flip(dims=[1]).argmax(dim=1)
        rows = torch.arange(hidden_state.size(0), device=hidden_state.device)
        return hidden_state[rows, last_index]

    def forward(self, input_ids, attention_mask, task_type, labels=None, **kwargs):
        """Run the backbone and every head named in ``task_type``.

        Args:
            input_ids, attention_mask: Tokenized batch.
            task_type: String or collection; a head runs when its name
                (``"matching"``, ``"classification"``, ``"keyword"``) is
                contained in it.
            labels: Optional dict with ``matching_labels``,
                ``classification_labels`` and/or ``keyword_labels``.

        Returns:
            Dict with per-task logits, per-task losses where labels were
            given, the weighted ``loss`` (0 when nothing was scored),
            ``hidden_states`` and ``last_hidden_state``.
        """
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )
        last_hidden_state = outputs.hidden_states[-1]
        pooled_output = self._last_token_pool(last_hidden_state, attention_mask)
        # The heads are plain nn.Linear layers; align the backbone activations
        # with their device and dtype.
        head_weight = self.matching_head.weight
        pooled_output = pooled_output.to(device=head_weight.device, dtype=head_weight.dtype)

        task_outputs = {}
        total_loss = 0
        if "matching" in task_type:
            matching_logits = self.matching_head(pooled_output)
            task_outputs["matching"] = matching_logits
            if labels is not None and "matching_labels" in labels:
                matching_loss = self.matching_loss(
                    matching_logits,
                    labels["matching_labels"].to(matching_logits.device),
                )
                total_loss += self.task_weights["matching"] * matching_loss
                task_outputs["matching_loss"] = matching_loss
        if "classification" in task_type:
            classification_logits = self.classification_head(pooled_output)
            task_outputs["classification"] = classification_logits
            if labels is not None and "classification_labels" in labels:
                classification_loss = self.classification_loss(
                    classification_logits,
                    labels["classification_labels"].to(classification_logits.device),
                )
                total_loss += self.task_weights["classification"] * classification_loss
                task_outputs["classification_loss"] = classification_loss
        if "keyword" in task_type:
            keyword_logits = self.keyword_head(pooled_output).squeeze(-1)
            task_outputs["keyword"] = keyword_logits
            if labels is not None and "keyword_labels" in labels:
                keyword_loss = self.keyword_loss(
                    keyword_logits,
                    labels["keyword_labels"].to(keyword_logits.device).float(),
                )
                total_loss += self.task_weights["keyword"] * keyword_loss
                task_outputs["keyword_loss"] = keyword_loss
        task_outputs["loss"] = total_loss
        task_outputs["hidden_states"] = outputs.hidden_states
        task_outputs["last_hidden_state"] = last_hidden_state
        return task_outputs
class MultiTaskDataProcessor:
    """Generate small synthetic datasets for the three training tasks.

    Every record is a dict with a prompt-style ``text``, a ``task_type`` tag
    (``"matching"``, ``"classification"`` or ``"keyword"``) and the single
    ``*_labels`` field that belongs to that task.
    """

    # One label column per task; used to give every record the same schema.
    LABEL_KEYS = ("matching_labels", "classification_labels", "keyword_labels")

    def __init__(self):
        # Maps a task name to the bound method that generates its samples.
        self.task_templates = {
            "matching": self.create_matching_data,
            "classification": self.create_classification_data,
            "keyword": self.create_keyword_data
        }

    def create_matching_data(self, num_samples: int = 1000) -> List[Dict]:
        """Create text-pair matching samples.

        Args:
            num_samples: Requested sample count; ``num_samples // 2`` positive
                pairs are emitted first, followed by the same number of
                negative pairs.

        Returns:
            A list of records carrying ``matching_labels`` (1 or 0).
        """
        positive_pairs = [
            ("今天天氣很好", "陽光明媚的早晨", 1),
            ("我喜歡吃蘋果", "蘋果是我最喜歡的水果", 1),
            ("學習編程很難", "編程學習有挑戰性", 1),
        ]
        negative_pairs = [
            ("今天天氣很好", "計算機編程很有趣", 0),
            ("我喜歡吃蘋果", "汽車需要加油", 0),
            ("學習編程很難", "天空是藍色的", 0),
        ]
        half = num_samples // 2
        data = []
        for pairs in (positive_pairs, negative_pairs):
            for _ in range(half):
                text1, text2, label = random.choice(pairs)
                data.append({
                    "text": f"判斷文本相關性:{text1} [SEP] {text2}",
                    "matching_labels": label,
                    "task_type": "matching"
                })
        return data

    def create_classification_data(self, num_samples: int = 1000) -> List[Dict]:
        """Create topic-classification samples.

        Args:
            num_samples: Requested sample count, split evenly over the
                categories (any remainder of the integer division is dropped).

        Returns:
            A list of records carrying ``classification_labels``, the index of
            the category in declaration order.
        """
        categories = {
            "體育": ["籃球比賽", "足球運動員", "奧運會"],
            "科技": ["人工智能", "編程語言", "智能手機"],
            "娛樂": ["電影明星", "音樂演唱會", "電視劇"],
            "政治": ["國際關系", "政府政策", "選舉"],
            "經濟": ["股票市場", "經濟增長", "通貨膨脹"]
        }
        per_category = num_samples // len(categories)
        data = []
        # enumerate() yields the label index directly instead of searching the
        # key list on every iteration.
        for label, examples in enumerate(categories.values()):
            for _ in range(per_category):
                text = random.choice(examples)
                data.append({
                    "text": f"文本分類:{text}",
                    "classification_labels": label,
                    "task_type": "classification"
                })
        return data

    def create_keyword_data(self, num_samples: int = 1000) -> List[Dict]:
        """Create keyword-extraction samples.

        The templates are sampled ``num_samples`` times so this task is sized
        like the other two; previously the argument was ignored and exactly
        one record per template was produced.

        Args:
            num_samples: Number of records to generate.

        Returns:
            A list of records carrying ``keyword_labels``.
        """
        texts_with_keywords = [
            ("人工智能和機器學習正在改變世界", "人工智能,機器學習"),
            ("今天的籃球比賽非常精彩", "籃球,比賽"),
            ("政府發布了新的經濟政策", "政府,經濟政策"),
            ("這部電影獲得了奧斯卡獎", "電影,奧斯卡獎"),
            ("智能手機市場持續增長", "智能手機,市場")
        ]
        data = []
        for _ in range(num_samples):
            text, keywords = random.choice(texts_with_keywords)
            # Simplified placeholder label: 1 when more than one keyword is
            # listed. NOTE(review): every template above lists two keywords,
            # so the label is always 1 — replace with real labels before
            # relying on this head.
            keyword_label = 1 if len(keywords.split(',')) > 1 else 0
            data.append({
                "text": f"提取關鍵詞:{text}",
                "keyword_labels": keyword_label,
                "task_type": "keyword"
            })
        return data

    @classmethod
    def _with_uniform_schema(cls, records: List[Dict]) -> List[Dict]:
        """Return copies of ``records`` that all expose every label key.

        Labels that do not apply to a record's task are set to ``None``.
        """
        return [
            {
                "text": record["text"],
                "task_type": record["task_type"],
                **{key: record.get(key) for key in cls.LABEL_KEYS},
            }
            for record in records
        ]

    def generate_multitask_dataset(self, samples_per_task: int = 1000) -> "Dataset":
        """Generate, shuffle and wrap the samples of all tasks.

        Args:
            samples_per_task: Sample count forwarded to each task generator.

        Returns:
            A ``Dataset`` with the columns ``text``, ``task_type`` and one
            column per entry of ``LABEL_KEYS``.
        """
        all_data = []
        for task_func in self.task_templates.values():
            all_data.extend(task_func(samples_per_task))
        random.shuffle(all_data)
        # Dataset.from_list derives its columns from the first record only, so
        # heterogeneous records would lose whichever label columns the first
        # (random) record happens not to have.
        return Dataset.from_list(self._with_uniform_schema(all_data))
class MultiTaskTrainer(Trainer):
    """Trainer whose loss comes from the multi-task model's own ``loss`` output."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the combined multi-task loss for one batch.

        Args:
            model: Callable accepting ``input_ids``, ``attention_mask``,
                ``task_type`` and ``labels`` and returning a mapping that
                contains ``"loss"``.
            inputs: Batch mapping. ``task_type`` and every key ending in
                ``_labels`` are removed from it and passed to the model
                separately.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted and ignored; newer Trainer versions
                pass this keyword to ``compute_loss`` overrides.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        # Falls back to the matching task when the batch carries no task tag.
        task_type = inputs.pop("task_type", "matching")
        # Iterate over a snapshot of the keys because entries are popped.
        labels = {
            key: inputs.pop(key)
            for key in list(inputs.keys())
            if key.endswith("_labels")
        }
        outputs = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            task_type=task_type,
            labels=labels
        )
        loss = outputs["loss"]
        return (loss, outputs) if return_outputs else loss
def setup_multitask_training():
    """Build the model, tokenizer, dataset splits and trainer.

    Returns:
        A tuple ``(trainer, model, tokenizer, dataset_dict)`` where
        ``dataset_dict`` holds the raw (untokenized) ``train``,
        ``validation`` and ``test`` splits.
    """
    # NOTE(review): confirm this model id exists on the Hub before running.
    model_name = "Qwen/Qwen2.5-4B"
    model = MultiTaskTextMatchingModel(model_name)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        # Reuse EOS for padding when the tokenizer defines no pad token.
        tokenizer.pad_token = tokenizer.eos_token

    processor = MultiTaskDataProcessor()
    dataset = processor.generate_multitask_dataset(500)  # 500 samples per task

    # 80/20 train/test, then 10% of the train part becomes validation.
    train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
    train_val_split = train_test_split["train"].train_test_split(test_size=0.1, seed=42)
    dataset_dict = DatasetDict({
        "train": train_val_split["train"],
        "validation": train_val_split["test"],
        "test": train_test_split["test"]
    })

    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            padding=False,
            max_length=256,
            return_tensors=None
        )

    # Drop only the raw text. The task tag and label columns stay attached to
    # their own rows in every split; re-attaching them afterwards by index
    # from the train split gave validation/test rows the wrong labels.
    tokenized_datasets = dataset_dict.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"]
    )

    training_args = TrainingArguments(
        output_dir="./multitask_output",
        num_train_epochs=5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        warmup_ratio=0.1,
        logging_steps=50,
        eval_steps=200,
        # load_best_model_at_end requires save_steps to be a round multiple
        # of eval_steps; the previous value of 500 is rejected.
        save_steps=400,
        save_total_limit=3,
        evaluation_strategy="steps",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        fp16=True,
        # Keep task_type / *_labels so compute_loss can read them.
        remove_unused_columns=False,
        report_to=["tensorboard"],
    )

    # NOTE(review): this collator pads through the tokenizer; confirm it can
    # batch the string-valued task_type column and None labels, or replace it
    # with a task-aware collator.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    trainer = MultiTaskTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    return trainer, model, tokenizer, dataset_dict
def train_multitask_model():
    """Run multi-task fine-tuning end to end and persist the results.

    Returns:
        A tuple ``(trainer, model, tokenizer)``.
    """
    print("設置多任務訓練環境...")
    # The raw dataset splits returned by the setup step are not needed here.
    trainer, model, tokenizer, _ = setup_multitask_training()

    print("開始多任務訓練...")
    training_output = trainer.train()

    # Persist the model weights and the trainer state.
    trainer.save_model()
    trainer.save_state()

    # Log and store the metrics reported by the training run.
    train_metrics = training_output.metrics
    for record_metrics in (trainer.log_metrics, trainer.save_metrics):
        record_metrics("train", train_metrics)

    print("多任務訓練完成!")
    return trainer, model, tokenizer
class MultiTaskEvaluator:
    """Accuracy evaluation for the matching and classification heads."""

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        # Inputs are moved to the device of the model's first parameter.
        self.device = next(model.parameters()).device

    def _evaluate_task(self, test_dataset, task_type, label_key):
        """Score one task over the rows of ``test_dataset`` tagged with it.

        Rows with another ``task_type``, or whose ``label_key`` is missing or
        ``None``, are skipped. The model is switched to eval mode while
        scoring and its previous train/eval mode is restored afterwards.

        Returns:
            ``{"<task_type>_accuracy": float, "total_samples": int}``.
        """
        correct = 0
        total = 0
        was_training = self.model.training
        self.model.eval()
        try:
            for item in test_dataset:
                if item["task_type"] != task_type:
                    continue
                true_label = item.get(label_key)
                if true_label is None:
                    continue
                inputs = self.tokenizer(
                    item["text"],
                    return_tensors="pt",
                    truncation=True,
                    max_length=256
                ).to(self.device)
                with torch.no_grad():
                    outputs = self.model(
                        input_ids=inputs["input_ids"],
                        attention_mask=inputs["attention_mask"],
                        task_type=task_type
                    )
                # The model output is keyed by task name.
                prediction = torch.argmax(outputs[task_type], dim=-1).item()
                if prediction == true_label:
                    correct += 1
                total += 1
        finally:
            self.model.train(was_training)
        accuracy = correct / total if total > 0 else 0
        return {f"{task_type}_accuracy": accuracy, "total_samples": total}

    def evaluate_matching_task(self, test_dataset):
        """Return ``matching_accuracy`` and ``total_samples`` for matching rows."""
        return self._evaluate_task(test_dataset, "matching", "matching_labels")

    def evaluate_classification_task(self, test_dataset):
        """Return ``classification_accuracy`` and ``total_samples`` for classification rows."""
        return self._evaluate_task(test_dataset, "classification", "classification_labels")

    def comprehensive_evaluation(self, test_dataset):
        """Evaluate both tasks and add a sample-weighted overall accuracy.

        Returns:
            A dict with ``matching_task``, ``classification_task``,
            ``overall_accuracy`` and ``total_evaluated_samples``.
        """
        matching_results = self.evaluate_matching_task(test_dataset)
        classification_results = self.evaluate_classification_task(test_dataset)
        # accuracy * sample count recovers the number of correct predictions.
        total_correct = (
            matching_results["matching_accuracy"] * matching_results["total_samples"] +
            classification_results["classification_accuracy"] * classification_results["total_samples"]
        )
        total_samples = matching_results["total_samples"] + classification_results["total_samples"]
        overall_accuracy = total_correct / total_samples if total_samples > 0 else 0
        return {
            "matching_task": matching_results,
            "classification_task": classification_results,
            "overall_accuracy": overall_accuracy,
            "total_evaluated_samples": total_samples
        }
# 使用示例
if __name__ == "__main__":
    import random
    # Seed Python's and torch's RNGs for reproducibility.
    random.seed(42)
    torch.manual_seed(42)
    print("=== 多任務聯合學習演示 ===")
    # Train the multi-task model.
    trainer, model, tokenizer = train_multitask_model()
    # Evaluate the trained model.
    print("\n開始評估多任務模型...")
    # Rebuild only the dataset instead of calling setup_multitask_training()
    # again, which would load a second copy of the model just to obtain the
    # test split. The sample count, split ratio and seeds below must mirror
    # setup_multitask_training(); re-seeding is intended to reproduce the
    # same generated data (assumes nothing consumes Python's RNG before the
    # dataset is generated during setup — TODO confirm).
    random.seed(42)
    full_dataset = MultiTaskDataProcessor().generate_multitask_dataset(500)
    dataset_dict = {
        "test": full_dataset.train_test_split(test_size=0.2, seed=42)["test"]
    }
    evaluator = MultiTaskEvaluator(model, tokenizer)
    results = evaluator.comprehensive_evaluation(dataset_dict["test"])
    print("\n=== 評估結果 ===")
    print(f"文本匹配準確率: {results['matching_task']['matching_accuracy']:.4f}")
    print(f"文本分類準確率: {results['classification_task']['classification_accuracy']:.4f}")
    print(f"總體準確率: {results['overall_accuracy']:.4f}")
    print(f"總評估樣本數: {results['total_evaluated_samples']}")
8.3 LLaMA Factory 多任務訓練命令
llamafactory-cli train \
--model_name_or_path Qwen/Qwen2.5-4B \
--dataset multitask_dataset \
--template qwen \
--finetuning_type lora \
--lora_target q_proj,k_proj,v_proj,o_proj \
--lora_rank 16 \
--lora_alpha 32 \
--lora_dropout 0.1 \
--output_dir ./multitask_output \
--cutoff_len 512 \
--num_train_epochs 5 \
--per_device_train_batch_size 4 \
--gradient_accumulation_steps 4 \
--learning_rate 2e-4 \
--warmup_ratio 0.1 \
--evaluation_strategy steps \
--eval_steps 200 \
--load_best_model_at_end \
--metric_for_best_model eval_loss \
--greater_is_better false \
--fp16 \
--do_train \
--do_eval \
--multitask_learning \
--task_weights '{"matching": 1.0, "classification": 0.7, "keyword": 0.5}'
8.4 MS-SWIFT 多任務訓練命令
CUDA_VISIBLE_DEVICES=0 swift sft \
--model_id_or_path Qwen/Qwen2.5-4B \
--dataset multitask-dataset \
--output_dir ./swift_multitask_output \
--lora_rank 16 \
--lora_alpha 32 \
--lora_dropout 0.05 \
--lora_target_modules q_proj,k_proj,v_proj,o_proj \
--learning_rate 2e-4 \
--num_train_epochs 5 \
--max_length 512 \
--per_device_train_batch_size 4 \
--gradient_accumulation_steps 4 \
--warmup_ratio 0.1 \
--logging_steps 50 \
--save_steps 500 \
--eval_steps 200 \
--save_total_limit 3 \
--evaluation_strategy steps \
--load_best_model_at_end \
--metric_for_best_model eval_loss \
--greater_is_better false \
--fp16 \
--do_train \
--do_eval \
--multitask_learning true \
--task_types matching,classification,keyword \
--task_weights 1.0,0.7,0.5
8.5 多任務數據集配置
# dataset_info.yaml
multitask_dataset:
file_name: multitask_data.json
formatting: multitask
tasks:
- name: matching
type: classification
num_labels: 2
- name: classification
type: classification
num_labels: 5
- name: keyword
type: sequence_labeling
9. 模型評估與測試
9.1 完整評估腳本
import evaluate
import torch
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
import json
import numpy as np
class TextMatchingEvaluator:
    """Evaluate a generative model on text-pair matching via prompting."""

    def __init__(self, model, tokenizer, device="cuda"):
        self.model = model
        self.tokenizer = tokenizer
        # Device the tokenized prompt tensors are moved to before generation.
        self.device = device

    @staticmethod
    def _parse_answer(answer: str) -> str:
        """Map generated text to "相關" or "不相關".

        The negative answer contains the positive one as a substring, so it
        must be tested first; otherwise every negative answer is read as
        positive. Text containing neither marker is treated as negative.
        """
        if "不相關" in answer:
            return "不相關"
        return "相關" if "相關" in answer else "不相關"

    @staticmethod
    def _iter_examples(test_dataset):
        """Yield the rows of ``test_dataset`` one at a time by integer index.

        Integer indexing returns one row for both a plain list of dicts and
        a columnar dataset, whereas iterating over a slice of a columnar
        dataset would yield column names.
        """
        for idx in range(len(test_dataset)):
            yield test_dataset[idx]

    def predict_single_pair(self, text1: str, text2: str) -> str:
        """Predict whether two texts are related.

        Returns:
            "相關" or "不相關".
        """
        prompt = f"判斷以下兩個文本是否語義相關:\n文本1:{text1}\n文本2:{text2}\n答案:"
        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=10,
                do_sample=False,
                pad_token_id=self.tokenizer.eos_token_id
            )
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        # The decoded text is split on the answer marker that ends the prompt
        # and only the trailing part is parsed.
        answer = response.split("答案:")[-1].strip()
        return self._parse_answer(answer)

    def evaluate_dataset(self, test_dataset, batch_size: int = 8):
        """Predict every row of ``test_dataset`` and compute metrics.

        Args:
            test_dataset: Indexable collection of rows with ``text1``,
                ``text2`` and ``label`` fields.
            batch_size: Kept for backward compatibility; predictions are made
                one pair at a time, so it does not affect the result.

        Returns:
            ``(metrics, report, predictions)``: a dict with accuracy, F1,
            precision and recall, the classification report as a dict, and
            the list of predicted labels.
        """
        predictions = []
        ground_truth = []
        for item in tqdm(self._iter_examples(test_dataset), total=len(test_dataset)):
            pred = self.predict_single_pair(item["text1"], item["text2"])
            # Exact comparison: a substring test would also match "不相關".
            predictions.append(1 if pred == "相關" else 0)
            ground_truth.append(item["label"])
        metrics = {
            "accuracy": accuracy_score(ground_truth, predictions),
            "f1_score": f1_score(ground_truth, predictions),
            "precision": precision_score(ground_truth, predictions),
            "recall": recall_score(ground_truth, predictions)
        }
        report = classification_report(ground_truth, predictions, output_dict=True)
        return metrics, report, predictions

    def evaluate_hard_cases(self, hard_cases_dataset):
        """Evaluate a hard-case dataset and return its headline metrics."""
        hard_metrics, hard_report, _ = self.evaluate_dataset(hard_cases_dataset)
        return {
            "hard_cases_accuracy": hard_metrics["accuracy"],
            "hard_cases_f1": hard_metrics["f1_score"],
            "detailed_report": hard_report
        }

    def create_evaluation_report(self, test_dataset, hard_cases_dataset=None, output_path: str = "evaluation_report.json"):
        """Evaluate, optionally evaluate hard cases, and write a JSON report.

        Args:
            test_dataset: Main evaluation data.
            hard_cases_dataset: Optional extra data; skipped when falsy.
            output_path: Destination of the JSON report.

        Returns:
            ``(report, predictions)`` for the main dataset.
        """
        print("開始基礎評估...")
        base_metrics, base_report, predictions = self.evaluate_dataset(test_dataset)
        report = {
            "base_metrics": base_metrics,
            "base_detailed_report": base_report,
            "evaluation_time": str(np.datetime64('now')),
            "model_info": str(self.model.config)
        }
        if hard_cases_dataset:
            print("開始困難案例評估...")
            hard_metrics = self.evaluate_hard_cases(hard_cases_dataset)
            report["hard_cases_metrics"] = hard_metrics
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)
        print(f"評估報告已保存到: {output_path}")
        return report, predictions
# 使用示例
def run_complete_evaluation(model_path: str, test_data_path: str):
    """Load a fine-tuned model and run the full evaluation on the test split.

    Args:
        model_path: Path or id passed to ``from_pretrained`` for both the
            tokenizer and the model.
        test_data_path: Path handed to ``TextMatchingDataProcessor.load_dataset``;
            the ``"test"`` split of the result is evaluated.

    Returns:
        ``(report, predictions)`` as produced by
        ``TextMatchingEvaluator.create_evaluation_report``.
    """
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True
    )

    processor = TextMatchingDataProcessor()
    test_dataset = processor.load_dataset(test_data_path)["test"]

    # Send inputs to the device the model was actually placed on rather than
    # the evaluator's hard-coded "cuda" default, which does not exist on a
    # CPU-only host.
    evaluator = TextMatchingEvaluator(model, tokenizer, device=model.device)

    report, predictions = evaluator.create_evaluation_report(
        test_dataset,
        output_path="model_evaluation_report.json"
    )
    print("評估完成!")
    print(f"準確率: {report['base_metrics']['accuracy']:.4f}")
    print(f"F1分數: {report['base_metrics']['f1_score']:.4f}")
    return report, predictions
# 運行評估
if __name__ == "__main__":
report, predictions = run_complete_evaluation(
"./text_matching_output",
"text_matching_data.json"
    )
10. 總結
- 數據質量優先:高質量的訓練數據是模型效果的基石,合理的數據增強可以顯著提升模型泛化能力
- 技術方案匹配:根據具體場景選擇合適的技術方案:
基礎對比學習:適合標注數據充足的場景
指令微調+思維鏈:需要可解釋性的場景
多任務學習:數據來源多樣的場景
檢索增強生成:需要外部知識的專業領域
- 框架靈活選擇:
- LLaMA Factory:適合快速上手和可視化操作
- MS-SWIFT:適合深度定制和編程控制
- Hugging Face Transformers:提供最大的靈活性
- 評估體系完善:建立多維度評估機制確保模型質量,包括自動指標和人工評估。
本文轉載自鴻煊的學習筆記,作者:乘風破浪jxj

















