 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import trl
 from accelerate.utils import gather_object, is_peft_model
+from packaging import version
 from transformers import PreTrainedModel
 from trl import GKDTrainer as HFGKDTrainer
 from trl import SFTTrainer as HFSFTTrainer
                      unwrap_model_for_generation)
 from ..mixin import SwiftMixin
 from .rollout_mixin import DataType, RolloutTrainerMixin
-from .utils import identity_data_collator, patch_profiling_context, patch_profiling_decorator, prepare_deepspeed
+from .utils import (get_gather_if_zero3_context, identity_data_collator, patch_profiling_context,
+                    patch_profiling_decorator, prepare_deepspeed)

 try:
     from liger_kernel.chunked_loss import LigerFusedLinearJSDLoss
@@ -61,10 +64,12 @@ def __init__(self, model: Optional[Union[PreTrainedModel, nn.Module, str]] = Non
         self._prepare_liger_loss()

         self.teacher_ds3_gather_for_generation = args.ds3_gather_for_generation
+        self.is_teacher_ds3 = None
         # Initialize teacher model
         if self.is_deepspeed_enabled:
             if teacher_deepspeed_config is not None:
-                if teacher_deepspeed_config.get('zero_optimization', {}).get('stage') != 3:
+                self.is_teacher_ds3 = teacher_deepspeed_config.get('zero_optimization', {}).get('stage') == 3
+                if not self.is_teacher_ds3:
                     self.teacher_ds3_gather_for_generation = False
                 self.teacher_model = prepare_deepspeed(
                     teacher_model, self.accelerator, deepspeed_config=teacher_deepspeed_config, training_args=args)
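
The `is_teacher_ds3` flag above is derived purely from the teacher's DeepSpeed config dict, so it can be checked without touching the engine. A minimal standalone sketch of that lookup (the helper name and example configs are illustrative, not part of the patch):

```python
from typing import Any, Dict, Optional


def is_zero3_config(ds_config: Optional[Dict[str, Any]]) -> bool:
    # Hypothetical helper: True only when the config explicitly requests ZeRO stage 3.
    if not ds_config:
        return False
    return ds_config.get('zero_optimization', {}).get('stage') == 3


# ZeRO-3 partitions parameters, so generation-time gathering only matters in this case.
assert is_zero3_config({'zero_optimization': {'stage': 3}})
assert not is_zero3_config({'zero_optimization': {'stage': 2}})
assert not is_zero3_config(None)
```
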
@@ -88,6 +93,7 @@ def __init__(self, model: Optional[Union[PreTrainedModel, nn.Module, str]] = Non
             self.maybe_activation_offload_context = get_act_offloading_ctx_manager(model=self.model)
         else:
             self.maybe_activation_offload_context = nullcontext()
+        self._trl_version_gte_0_24 = version.parse(trl.__version__) >= version.parse('0.24')

     # Code borrowed from huggingface/trl
     def generate_on_policy_outputs(self, model, inputs, generation_config, pad_token_id=None):
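
`_trl_version_gte_0_24` is a plain `packaging.version` comparison against the installed trl release; a small sketch of the same gating pattern (the constant name and printed messages are only illustrative):

```python
import trl
from packaging import version

# Note: pre-releases such as '0.24.0.dev0' compare as older than '0.24'.
TRL_GTE_0_24 = version.parse(trl.__version__) >= version.parse('0.24')

if TRL_GTE_0_24:
    print(f'trl {trl.__version__}: apply the extra sequence-length normalization')
else:
    print(f'trl {trl.__version__}: keep the loss as returned')
```
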
@@ -131,7 +137,7 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N
         model_inputs = {k: v for k, v in inputs.items() if k not in {'prompt', 'labels'}}
         # If generate is used, then use_logits_to_keep must be set to False.
         use_logits_to_keep = self.get_use_logits_to_keep(True)
-        if use_logits_to_keep:
+        if use_logits_to_keep and not self.use_liger_gkd_loss:
             self.prepare_logits_to_keep(inputs)
             model_inputs['logits_to_keep'] = inputs['logits_to_keep']

@@ -176,17 +182,24 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N
                 student_head = unwrapped_student.get_output_embeddings()
                 teacher_head = unwrapped_teacher.get_output_embeddings()

-                # Compute liger fused JSD loss
-                loss = self.liger_jsd_loss(
-                    student_input=student_hidden,
-                    student_weight=student_head.weight,
-                    teacher_input=teacher_hidden,
-                    teacher_weight=teacher_head.weight,
-                    true_labels=true_labels,
-                    student_bias=getattr(student_head, 'bias', None),
-                    teacher_bias=getattr(teacher_head, 'bias', None),
-                )
-
+                # Prepare context managers for gathering parameters in zero3
+                teacher_context = get_gather_if_zero3_context(self, is_zero3=self.is_teacher_ds3)(teacher_head.weight)
+                student_context = get_gather_if_zero3_context(self)(student_head.weight)
+
+                with teacher_context, student_context:
+                    # Compute liger fused JSD loss
+                    loss = self.liger_jsd_loss(
+                        student_input=student_hidden,
+                        student_weight=student_head.weight,
+                        teacher_input=teacher_hidden,
+                        teacher_weight=teacher_head.weight,
+                        true_labels=true_labels,
+                        student_bias=getattr(student_head, 'bias', None),
+                        teacher_bias=getattr(teacher_head, 'bias', None),
+                    )
+                # loss / grad norm is unexpectedly large, normalize by sequence length
+                # https://github.com/linkedin/Liger-Kernel/blob/v0.6.3/src/liger_kernel/chunked_loss/jsd_loss.py#L9-L39
+                loss /= student_hidden.shape[1]
                 # Release hidden states after loss computation
                 del student_hidden, teacher_hidden, true_labels
             else:
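
`get_gather_if_zero3_context` comes from `.utils` and is not shown in this diff; the point of the two context managers is that under ZeRO-3 the output-embedding weights are partitioned across ranks and must be materialized before Liger's fused linear+JSD kernel reads them. A rough sketch of what such a helper could look like, assuming DeepSpeed's `GatheredParameters` context manager (the actual helper in `.utils` may be implemented differently):

```python
from contextlib import nullcontext


def gather_if_zero3(is_zero3: bool):
    """Hypothetical factory: returns a callable that gathers ZeRO-3 partitioned params, or a no-op."""

    def make_context(*params):
        if not is_zero3:
            return nullcontext()
        import deepspeed

        # Temporarily materializes the full parameters on every rank inside the `with` block.
        return deepspeed.zero.GatheredParameters(list(params), modifier_rank=None)

    return make_context


# Usage mirroring the call sites above (student_head / teacher_head are output-embedding modules):
# with gather_if_zero3(teacher_is_zero3)(teacher_head.weight), gather_if_zero3(student_is_zero3)(student_head.weight):
#     loss = liger_jsd_loss(student_weight=student_head.weight, teacher_weight=teacher_head.weight, ...)
```
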
@@ -222,7 +235,8 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N
                 teacher_logits=shifted_teacher_logits,
                 beta=self.beta,
             )
-
+            if self._trl_version_gte_0_24:
+                loss /= shifted_student_logits.shape[1]
         # Add SFT loss if enabled (common for both paths)
         if self.args.sft_alpha > 0:
             loss = loss + self.args.sft_alpha * outputs_student.loss
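
For reference, the generalized JSD behind both code paths interpolates the teacher and student distributions with `beta`: with `M = beta * P_T + (1 - beta) * P_S`, the per-token loss is `beta * KL(P_T || M) + (1 - beta) * KL(P_S || M)`. The sketch below is a plain, unfused version of that formula (not the Liger kernel or trl's implementation); it also shows why dividing a token-summed loss by the sequence length, as both normalizations above do, brings it back to a per-token scale:

```python
import math

import torch
import torch.nn.functional as F


def generalized_jsd(student_logits, teacher_logits, beta=0.5):
    """Interpolated JSD per token, averaged over batch and sequence length (assumes 0 < beta < 1)."""
    s_logp = F.log_softmax(student_logits, dim=-1)
    t_logp = F.log_softmax(teacher_logits, dim=-1)
    # Mixture distribution M computed in log space for numerical stability.
    m_logp = torch.logsumexp(torch.stack([s_logp + math.log(1 - beta), t_logp + math.log(beta)]), dim=0)
    kl_t = F.kl_div(m_logp, t_logp, reduction='none', log_target=True).sum(-1)  # KL(P_T || M) per token
    kl_s = F.kl_div(m_logp, s_logp, reduction='none', log_target=True).sum(-1)  # KL(P_S || M) per token
    per_token = beta * kl_t + (1 - beta) * kl_s  # shape: [batch, seq_len]
    # Summing over tokens and dividing by batch * seq_len is the "normalize by sequence length" step.
    return per_token.sum() / per_token.numel()


# Toy usage: batch of 2, sequence length 5, vocabulary of 11.
student = torch.randn(2, 5, 11)
teacher = torch.randn(2, 5, 11)
print(generalized_jsd(student, teacher, beta=0.5))
```
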