diff --git a/swift/model/constant.py b/swift/model/constant.py
index 3f5d3a4011..f90ac47379 100644
--- a/swift/model/constant.py
+++ b/swift/model/constant.py
@@ -154,6 +154,7 @@ class MLLMModelType:
     ovis2 = 'ovis2'
     ovis2_5 = 'ovis2_5'
     midashenglm = 'midashenglm'
+    mimo_v2 = 'mimo_v2'
 
     chatglm4v = 'chatglm4v'
     glm4v = 'glm4v'
diff --git a/swift/model/model_arch.py b/swift/model/model_arch.py
index bf115414f5..42bb7a6362 100644
--- a/swift/model/model_arch.py
+++ b/swift/model/model_arch.py
@@ -87,6 +87,7 @@ class MLLMModelArch:
     keye_vl = 'keye_vl'
 
     midashenglm = 'midashenglm'
+    mimo_v2 = 'mimo_v2'
     step_audio2_mini = 'step_audio2_mini'
     hunyuan_vl = 'hunyuan_vl'
     step3_vl = 'step3_vl'
@@ -787,6 +788,14 @@ def register_model_arch(model_arch: ModelKeys, *, exist_ok: bool = False) -> Non
         vision_tower='model.visual',
     ))
 
+register_model_arch(
+    MultiModelKeys(
+        MLLMModelArch.mimo_v2,
+        language_model=['model', 'lm_head'],
+        aligner='visual.merger',
+        vision_tower=['visual', 'audio_encoder', 'speech_embeddings'],
+    ))
+
 
 def get_model_arch(arch_name: Optional[str]) -> Optional[MultiModelKeys]:
     return MODEL_ARCH_MAPPING.get(arch_name)
diff --git a/swift/model/models/qwen.py b/swift/model/models/qwen.py
index 4a6d30f43b..d725556239 100644
--- a/swift/model/models/qwen.py
+++ b/swift/model/models/qwen.py
@@ -867,6 +867,34 @@ def get_model(self, model_dir: str, *args, **kwargs) -> PreTrainedModel:
         tags=['vision', 'video']))
 
 
+class MiMoV2Loader(Qwen2VLLoader):
+    """Loader for XiaomiMiMo/MiMo-V2.5 checkpoints."""
+
+    def get_model(self, model_dir: str, config, processor, model_kwargs) -> PreTrainedModel:
+        """Load the model and patch the vision tower's input embeddings.
+
+        Calls ``ModelLoader.get_model`` explicitly instead of ``super()``,
+        bypassing any ``get_model`` override on ``Qwen2VLLoader``.
+        """
+        model = ModelLoader.get_model(self, model_dir, config, processor, model_kwargs)
+        patch_get_input_embeddings(model.visual, 'patch_embed')
+        return model
+
+
+register_model(
+    ModelMeta(
+        MLLMModelType.mimo_v2, [
+            ModelGroup([
+                Model('XiaomiMiMo/MiMo-V2.5', 'XiaomiMiMo/MiMo-V2.5'),
+            ], TemplateType.mimo_v2),
+        ],
+        MiMoV2Loader,
+        model_arch=ModelArch.mimo_v2,
+        architectures=['MiMoV2ForCausalLM'],
+        requires=['transformers>=4.49', 'qwen_vl_utils>=0.0.6', 'decord'],
+        tags=['vision', 'video']))
+
+
 def patch_Qwen3VLMoeTextExperts_dtype():
     from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextExperts
     if hasattr(Qwen3VLMoeTextExperts, '_patch'):
diff --git a/swift/template/constant.py b/swift/template/constant.py
index d6998b9073..c926b0d3a5 100644
--- a/swift/template/constant.py
+++ b/swift/template/constant.py
@@ -150,6 +150,7 @@ class MLLMTemplateType:
     ovis2 = 'ovis2'
     ovis2_5 = 'ovis2_5'
     mimo_vl = 'mimo_vl'
+    mimo_v2 = 'mimo_v2'
     midashenglm = 'midashenglm'
 
     llama3_1_omni = 'llama3_1_omni'
diff --git a/swift/template/templates/qwen.py b/swift/template/templates/qwen.py
index de0ac6c80e..0a0f6cfcf9 100644
--- a/swift/template/templates/qwen.py
+++ b/swift/template/templates/qwen.py
@@ -505,6 +505,61 @@ class Qwen2_5VLTemplate(Qwen2VLTemplate):
         default_system='You are MiMo, an AI assistant developed by Xiaomi.'))
 
 
+class MiMoV2Template(Qwen2_5VLTemplate):
+    """Template for XiaomiMiMo/MiMo-V2.5.
+
+    Differences from Qwen2_5VLTemplate:
+    - MiMo-V2.5 does not use 3D rope position IDs (no get_rope_index).
+    - Video key is named 'video_pixel_values' instead of 'pixel_values_videos'.
+    - Supports thinking mode (registered with ``is_thinking=True``).
+    """
+
+    def _get_position_ids(self, inputs: Dict[str, Any]):
+        """Return an empty dict: no extra position IDs are produced."""
+        # MiMo-V2.5 uses standard rotary position embeddings,
+        # not 3D rope like Qwen2VL. No special position IDs needed.
+        return {}
+
+    def forward_context(self, model, inputs):
+        """Delegate straight to ``Template.forward_context``."""
+        # Skip Qwen2VL-specific flash attention patching
+        return Template.forward_context(self, model, inputs)
+
+    def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:
+        """Prepare encoded inputs for the model's forward pass.
+
+        Inference: rename 'pixel_values_videos' to 'video_pixel_values' in
+        place (when present) and return ``inputs``.
+        Training: return only ``{'inputs_embeds': ...}``, built from the token
+        embeddings and ``_get_inputs_embeds_hf``.
+        """
+        if not self.is_training:
+            # During inference, rename key to match MiMo-V2.5 forward signature
+            if 'pixel_values_videos' in inputs:
+                inputs['video_pixel_values'] = inputs.pop('pixel_values_videos')
+            return inputs
+        # For training, compute embeddings manually
+        input_ids = inputs['input_ids']
+        base_model = self.get_base_model(model)
+        inputs_embeds = base_model.model.embed_tokens(input_ids)
+        inputs_embeds = self._get_inputs_embeds_hf(inputs_embeds, inputs, model.visual, self.processor, model.config)
+        return {'inputs_embeds': inputs_embeds}
+
+
+register_template(
+    QwenTemplateMeta(
+        MLLMTemplateType.mimo_v2,
+        template_cls=MiMoV2Template,
+        default_system='You are MiMo, a helpful AI assistant engineered by Xiaomi.',
+        is_thinking=True,
+        # NOTE(review): these prefixes contain only newlines -- confirm the
+        # thinking tags were not stripped from the literals.
+        thinking_prefix='\n',
+        non_thinking_prefix='\n\n\n',
+        history_thinking_prefix='\n\n\n',
+    ))
+
+
 class Qwen3VLTemplate(Qwen2VLTemplate):
     version = 'v3'
 