diff --git a/swift/model/constant.py b/swift/model/constant.py
index 3f5d3a4011..f90ac47379 100644
--- a/swift/model/constant.py
+++ b/swift/model/constant.py
@@ -154,6 +154,7 @@ class MLLMModelType:
ovis2 = 'ovis2'
ovis2_5 = 'ovis2_5'
midashenglm = 'midashenglm'
+ mimo_v2 = 'mimo_v2'
chatglm4v = 'chatglm4v'
glm4v = 'glm4v'
diff --git a/swift/model/model_arch.py b/swift/model/model_arch.py
index bf115414f5..42bb7a6362 100644
--- a/swift/model/model_arch.py
+++ b/swift/model/model_arch.py
@@ -87,6 +87,7 @@ class MLLMModelArch:
keye_vl = 'keye_vl'
midashenglm = 'midashenglm'
+ mimo_v2 = 'mimo_v2'
step_audio2_mini = 'step_audio2_mini'
hunyuan_vl = 'hunyuan_vl'
step3_vl = 'step3_vl'
@@ -787,6 +788,14 @@ def register_model_arch(model_arch: ModelKeys, *, exist_ok: bool = False) -> Non
vision_tower='model.visual',
))
+register_model_arch(
+    MultiModelKeys(
+        MLLMModelArch.mimo_v2,
+        language_model=['model', 'lm_head'],  # two module paths make up the language-model group
+        aligner='visual.merger',  # this path sits under 'visual', which vision_tower also lists
+        vision_tower=['visual', 'audio_encoder', 'speech_embeddings'],  # NOTE(review): names suggest audio too
+    ))
+
def get_model_arch(arch_name: Optional[str]) -> Optional[MultiModelKeys]:
return MODEL_ARCH_MAPPING.get(arch_name)
diff --git a/swift/model/models/qwen.py b/swift/model/models/qwen.py
index 4a6d30f43b..d725556239 100644
--- a/swift/model/models/qwen.py
+++ b/swift/model/models/qwen.py
@@ -867,6 +867,28 @@ def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
tags=['vision', 'video']))
+class MiMoV2Loader(Qwen2VLLoader):
+
+    def get_model(self, model_dir: str, config, processor, model_kwargs) -> PreTrainedModel:
+        mimo = ModelLoader.get_model(self, model_dir, config, processor, model_kwargs)  # explicit base, not super()
+        patch_get_input_embeddings(mimo.visual, 'patch_embed')  # patch is applied to the visual sub-module
+        return mimo
+
+
+register_model(
+    ModelMeta(
+        MLLMModelType.mimo_v2, [
+            ModelGroup([
+                Model('XiaomiMiMo/MiMo-V2.5', 'XiaomiMiMo/MiMo-V2.5'),  # same repo id in both positional slots
+            ], TemplateType.mimo_v2),
+        ],
+        MiMoV2Loader,
+        model_arch=ModelArch.mimo_v2,  # assumes ModelArch exposes MLLMModelArch.mimo_v2 - TODO confirm
+        architectures=['MiMoV2ForCausalLM'],
+        requires=['transformers>=4.49', 'qwen_vl_utils>=0.0.6', 'decord'],
+        tags=['vision', 'video']))  # NOTE(review): no 'audio' tag; confirm against the model's modalities
+
+
def patch_Qwen3VLMoeTextExperts_dtype():
from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextExperts
if hasattr(Qwen3VLMoeTextExperts, '_patch'):
diff --git a/swift/template/constant.py b/swift/template/constant.py
index d6998b9073..c926b0d3a5 100644
--- a/swift/template/constant.py
+++ b/swift/template/constant.py
@@ -150,6 +150,7 @@ class MLLMTemplateType:
ovis2 = 'ovis2'
ovis2_5 = 'ovis2_5'
mimo_vl = 'mimo_vl'
+ mimo_v2 = 'mimo_v2'
midashenglm = 'midashenglm'
llama3_1_omni = 'llama3_1_omni'
diff --git a/swift/template/templates/qwen.py b/swift/template/templates/qwen.py
index de0ac6c80e..0a0f6cfcf9 100644
--- a/swift/template/templates/qwen.py
+++ b/swift/template/templates/qwen.py
@@ -505,6 +505,50 @@ class Qwen2_5VLTemplate(Qwen2VLTemplate):
default_system='You are MiMo, an AI assistant developed by Xiaomi.'))
+class MiMoV2Template(Qwen2_5VLTemplate):
+    """Template for XiaomiMiMo/MiMo-V2.5, layered on Qwen2_5VLTemplate.
+
+    What this subclass changes:
+    - No 3D rope position IDs: `_get_position_ids` returns an empty dict (no get_rope_index).
+    - At inference the video key 'pixel_values_videos' is renamed to 'video_pixel_values'.
+    - Supports thinking mode with <think>...</think> tags.
+    """
+
+    def _get_position_ids(self, inputs: Dict[str, Any]):
+        # MiMo-V2.5 relies on standard rotary position embeddings, unlike Qwen2VL's 3D rope.
+        # NOTE(review): returns a dict, not position IDs; confirm inherited callers accept `{}`.
+        return {}
+
+    def forward_context(self, model, inputs):
+        # Go straight to Template so the Qwen2VL-specific flash attention patching is skipped.
+        return Template.forward_context(self, model, inputs)
+
+    def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        if self.is_training:
+            # Training: build the embeddings by hand and hand back only `inputs_embeds`.
+            token_ids = inputs['input_ids']
+            unwrapped = self.get_base_model(model)
+            embeds = unwrapped.model.embed_tokens(token_ids)
+            embeds = self._get_inputs_embeds_hf(embeds, inputs, model.visual, self.processor, model.config)
+            return {'inputs_embeds': embeds}
+        # Inference: rename the video key to match the MiMo-V2.5 forward signature.
+        if 'pixel_values_videos' in inputs:
+            inputs['video_pixel_values'] = inputs.pop('pixel_values_videos')
+        return inputs
+
+
+register_template(
+    QwenTemplateMeta(
+        MLLMTemplateType.mimo_v2,
+        template_cls=MiMoV2Template,
+        default_system='You are MiMo, a helpful AI assistant engineered by Xiaomi.',
+        is_thinking=True,
+        thinking_prefix='\n',  # NOTE(review): newline-only; check that no <think> marker was lost
+        non_thinking_prefix='\n\n\n',  # NOTE(review): same concern; verify against the chat template
+        history_thinking_prefix='\n\n\n',  # identical to non_thinking_prefix
+    ))
+
+
class Qwen3VLTemplate(Qwen2VLTemplate):
version = 'v3'