本专栏所有程序均经过测试可成功执行本文给大家带来的教程是将YOLO26的主干网络替换为CAS_ViT来提取特征。文章在介绍主要的原理后将手把手教学如何进行模块的代码添加和修改并将修改后的完整代码放在文章的最后方便大家一键运行小白也可轻松上手实践。以帮助您更好地学习深度学习目标检测YOLO系列的挑战。专栏地址YOLO26改进-论文涨点——点击跳转看所有内容关注不迷路目录1.论文2. CAS_ViT代码实现2.1 将CAS_ViT添加到YOLO26中2.2 更改init.py文件2.3 添加yaml文件2.4 在task.py中进行注册2.5 执行程序3. 完整代码分享4. GFLOPs5. 进阶6.总结1.论文论文地址CAS-ViT: Convolutional Additive Self-attention Vision Transformers for Efficient Mobile Applications官方代码官方代码仓库点击即可跳转2.CAS_ViT代码实现2.1 将CAS_ViT添加到YOLO26中关键步骤一在ultralytics\ultralytics\nn\modules下面新建文件夹models在文件夹下新建CAS_ViT.py粘贴下面代码# # Code for CAS-ViT, modified to align with SwinTransformer usage and incorporate width_list. # import torch import torch.nn as nn # from torch.cuda.amp import autocast # autocast is not used in the provided snippet import numpy as np # from einops import rearrange, repeat # Not used in the final model structure directly # import itertools # Not used import os import copy from timm.models.layers import DropPath, trunc_normal_, to_2tuple from timm.models.registry import register_model # def stem(in_chs, out_chs): return nn.Sequential( nn.Conv2d(in_chs, out_chs // 2, kernel_size3, stride2, padding1), nn.BatchNorm2d(out_chs // 2), nn.ReLU(), nn.Conv2d(out_chs // 2, out_chs, kernel_size3, stride2, padding1), nn.BatchNorm2d(out_chs), nn.ReLU(), ) class Embedding(nn.Module): Patch Embedding that is implemented by a layer of conv. 
Input: tensor in shape [B, C, H, W] Output: tensor in shape [B, C, H/stride, W/stride] def __init__(self, patch_size16, stride16, padding0, in_chans3, embed_dim768, norm_layernn.BatchNorm2d): super().__init__() patch_size to_2tuple(patch_size) stride to_2tuple(stride) padding to_2tuple(padding) self.proj nn.Conv2d(in_chans, embed_dim, kernel_sizepatch_size, stridestride, paddingpadding) self.norm norm_layer(embed_dim) if norm_layer else nn.Identity() def forward(self, x): x self.proj(x) x self.norm(x) return x class Mlp(nn.Module): def __init__(self, in_features, hidden_featuresNone, out_featuresNone, act_layernn.GELU, drop0.): super().__init__() out_features out_features or in_features hidden_features hidden_features or in_features self.fc1 nn.Conv2d(in_features, hidden_features, 1) self.act act_layer() self.fc2 nn.Conv2d(hidden_features, out_features, 1) self.drop nn.Dropout(drop) def forward(self, x): x self.fc1(x) x self.act(x) x self.drop(x) x self.fc2(x) x self.drop(x) return x class SpatialOperation(nn.Module): def __init__(self, dim): super().__init__() self.block nn.Sequential( nn.Conv2d(dim, dim, 3, 1, 1, groupsdim), nn.BatchNorm2d(dim), nn.ReLU(True), nn.Conv2d(dim, 1, 1, 1, 0, biasFalse), nn.Sigmoid(), ) def forward(self, x): return x * self.block(x) class ChannelOperation(nn.Module): def __init__(self, dim): super().__init__() self.block nn.Sequential( nn.AdaptiveAvgPool2d((1, 1)), nn.Conv2d(dim, dim, 1, 1, 0, biasFalse), nn.Sigmoid(), ) def forward(self, x): return x * self.block(x) class LocalIntegration(nn.Module): def __init__(self, dim, ratio1, act_layernn.ReLU, norm_layer_mlpnn.GELU): # Renamed norm_layer to norm_layer_mlp to avoid clash super().__init__() mid_dim round(ratio * dim) # Original RCViT used GELU for norm here, but AdditiveBlock passes BatchNorm. # Lets make it flexible or stick to one. Given AdditiveBlock uses BatchNorm for its norms, # using BatchNorm here might be more consistent if norm_layer_mlp becomes BatchNorm2d. 
# However, the original code used norm_layernn.GELU in AdditiveBlock for LocalIntegrations norm. # This is confusing. Lets assume norm_layer_mlp is the activation *inside* the conv block. # And the actual norm is nn.BatchNorm2d. # The original code in AdditiveBlock passes norm_layernn.BatchNorm2d, which would be used by self.norm1, self.norm2. # But LocalIntegrations norm_layer argument was nn.GELU. This is likely an error or misunderstanding in my original interpretation. # Lets stick to what was passed: act_layer for activation, norm_layer_mlp for the norm *within* LocalIntegration # If AdditiveBlock sets norm_layernn.BatchNorm2d, then norm_layer_mlp here will be BatchNorm2d. # Re-evaluating: The AdditiveBlock passes norm_layernn.BatchNorm2d to the Stage, # which then passes it to AdditiveBlock. Inside AdditiveBlock, # self.local_perception LocalIntegration(..., norm_layernorm_layer) # So, norm_layer_mlp in LocalIntegration IS nn.BatchNorm2d. # The act_layer is nn.ReLU. self.network nn.Sequential( nn.Conv2d(dim, mid_dim, 1, 1, 0), norm_layer_mlp(mid_dim), # This will be nn.BatchNorm2d(mid_dim) nn.Conv2d(mid_dim, mid_dim, 3, 1, 1, groupsmid_dim), act_layer(), # This will be nn.ReLU() nn.Conv2d(mid_dim, dim, 1, 1, 0), ) def forward(self, x): return self.network(x) class AdditiveTokenMixer(nn.Module): def __init__(self, dim512, attn_biasFalse, proj_drop0.): super().__init__() self.qkv nn.Conv2d(dim, 3 * dim, 1, stride1, padding0, biasattn_bias) self.oper_q nn.Sequential( SpatialOperation(dim), ChannelOperation(dim), ) self.oper_k nn.Sequential( SpatialOperation(dim), ChannelOperation(dim), ) self.dwc nn.Conv2d(dim, dim, 3, 1, 1, groupsdim) self.proj nn.Conv2d(dim, dim, 3, 1, 1, groupsdim) self.proj_drop nn.Dropout(proj_drop) def forward(self, x): q, k, v self.qkv(x).chunk(3, dim1) q self.oper_q(q) k self.oper_k(k) out self.proj(self.dwc(q k) * v) out self.proj_drop(out) return out class AdditiveBlock(nn.Module): def __init__(self, dim, mlp_ratio4., attn_biasFalse, 
drop0., drop_path0., act_layernn.ReLU, norm_layernn.BatchNorm2d): # Changed default norm_layer to BatchNorm2d super().__init__() # norm_layer for LocalIntegration was originally nn.GELU in its definition, # but here its passed what AdditiveBlock receives (e.g. BatchNorm2d). # Lets assume LocalIntegrations internal norm should be norm_layer (e.g. BatchNorm2d) # and its activation should be act_layer (e.g. ReLU). self.local_perception LocalIntegration(dim, ratio1, act_layeract_layer, norm_layer_mlpnorm_layer) self.norm1 norm_layer(dim) self.attn AdditiveTokenMixer(dim, attn_biasattn_bias, proj_dropdrop) self.drop_path DropPath(drop_path) if drop_path 0. else nn.Identity() self.norm2 norm_layer(dim) mlp_hidden_dim int(dim * mlp_ratio) self.mlp Mlp(in_featuresdim, hidden_featuresmlp_hidden_dim, act_layeract_layer, dropdrop) # Original Mlp uses GELU by default def forward(self, x): x x self.local_perception(x) x x self.drop_path(self.attn(self.norm1(x))) x x self.drop_path(self.mlp(self.norm2(x))) return x def Stage(dim, index, layers, mlp_ratio4., act_layernn.GELU, norm_layernn.BatchNorm2d, attn_biasFalse, drop0., drop_path_rate0.): blocks [] for block_idx in range(layers[index]): block_dpr drop_path_rate * (block_idx sum(layers[:index])) / (sum(layers) - 1) blocks.append( AdditiveBlock( dim, mlp_ratiomlp_ratio, attn_biasattn_bias, dropdrop, drop_pathblock_dpr, act_layeract_layer, norm_layernorm_layer) # Pass norm_layer here ) blocks nn.Sequential(*blocks) return blocks class RCViT(nn.Module): def __init__(self, layers, embed_dims, mlp_ratios4, downsamples[True, True, True, True], norm_layernn.BatchNorm2d, attn_biasFalse, act_layernn.GELU, num_classes1000, drop_rate0., drop_path_rate0., fork_featFalse, distillationTrue, pretrainedNone, dummy_input_size(224,224), **kwargs): # Added dummy_input_size super().__init__() self.fork_feat fork_feat self.num_classes num_classes # Keep for classification mode self.distillation distillation # Keep for classification mode 
self.patch_embed stem(3, embed_dims[0]) network [] for i in range(len(layers)): stage Stage(embed_dims[i], i, layers, mlp_ratiomlp_ratios if isinstance(mlp_ratios, (int, float)) else mlp_ratios[i], act_layeract_layer, norm_layernorm_layer, attn_biasattn_bias, dropdrop_rate, drop_path_ratedrop_path_rate) network.append(stage) if i len(layers) - 1: break if downsamples[i] or embed_dims[i] ! embed_dims[i 1]: network.append( Embedding( patch_size3, stride2, padding1, in_chansembed_dims[i], embed_dimembed_dims[i1], norm_layernorm_layer) # Use passed norm_layer ) self.network nn.ModuleList(network) if self.fork_feat: # These indices should point to the output of a Stage block in self.network # Stage 0: self.network[0] # Stage 1: self.network[2] # Stage 2: self.network[4] # Stage 3: self.network[6] self.out_indices [0, 2, 4, 6] # Corresponds to the output of each of the 4 stages for i_emb, i_layer_in_network in enumerate(self.out_indices): # We need to ensure embed_dims[i_emb] matches the output dim of network[i_layer_in_network] # embed_dims are [dim_stage0, dim_stage1, dim_stage2, dim_stage3] # network[0] (Stage 0) outputs embed_dims[0] # network[2] (Stage 1) outputs embed_dims[1] # network[4] (Stage 2) outputs embed_dims[2] # network[6] (Stage 3) outputs embed_dims[3] current_embed_dim embed_dims[i_emb] if i_emb 0 and os.environ.get(FORK_LAST3, None): # This seems like a specific experimental setup layer nn.Identity() else: layer norm_layer(current_embed_dim) layer_name fnorm{i_layer_in_network} # Use network index for clarity self.add_module(layer_name, layer) # Calculate width_list for feature extraction mode try: dummy_h, dummy_w to_2tuple(dummy_input_size) dummy_input torch.randn(1, 3, dummy_h, dummy_w) # Store current training state and set to eval for dummy pass original_training_state self.training self.eval() with torch.no_grad(): features self.forward(dummy_input) # self.forward will use self.fork_feat self.width_list [f.size(1) for f in features] 
self.train(original_training_state) # Restore original training state except Exception as e: print(fRCViT Warning: Could not compute width_list during init: {e}) self.width_list [] # Fallback else: # Classifier head self.norm norm_layer(embed_dims[-1]) self.head nn.Linear( embed_dims[-1], num_classes) if num_classes 0 \ else nn.Identity() if self.distillation: self.dist_head nn.Linear( embed_dims[-1], num_classes) if num_classes 0 \ else nn.Identity() self.apply(self.cls_init_weights) # Initialize classifier weights self.width_list [] # Not typically needed for classification mode directly # Simplified weight initialization / loading if pretrained: self.load_pretrained(pretrained) def load_pretrained(self, pretrained_path): if os.path.exists(pretrained_path): print(fLoading pretrained weights from {pretrained_path}) checkpoint torch.load(pretrained_path, map_locationcpu) state_dict_key model if model in checkpoint else state_dict if state_dict in checkpoint else if state_dict_key: state_dict checkpoint[state_dict_key] else: # Assume the checkpoint is the state_dict itself state_dict checkpoint # Filter out unnecessary keys (e.g., classifier head if fork_featTrue) if self.fork_feat: # Remove classifier specific weights if we are in fork_feat mode # and the checkpoint contains them. for k in list(state_dict.keys()): if k.startswith(head.) or k.startswith(norm.): # final norm before head if not hasattr(self, k.split(.)[0]): # if self doesnt have head or norm (final one) print(f Ignoring {k} from pretrained checkpoint for fork_featTrue mode.) del state_dict[k] # Adjust for distillation head if necessary if not self.distillation and self.fork_featFalse: for k in list(state_dict.keys()): if k.startswith(dist_head.): print(f Ignoring {k} from pretrained checkpoint as distillation is False.) 
del state_dict[k] msg self.load_state_dict(state_dict, strictFalse) print(f Pretrained weights loaded with message: {msg}) else: print(fRCViT Warning: Pretrained path {pretrained_path} does not exist.) # init for classification def cls_init_weights(self, m): if isinstance(m, nn.Linear): trunc_normal_(m.weight, std.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) def init_weights(self, pretrainedNone): # Original, for mmdet. Kept for reference but simplified. # This method was complex and tied to mmdetections logger and _load_checkpoint. # Well use a simpler load_pretrained for now. pass def forward_tokens(self, x): outs [] for idx, block_module in enumerate(self.network): # Changed block to block_module to avoid clash x block_module(x) if self.fork_feat and idx in self.out_indices: norm_layer_module getattr(self, fnorm{idx}) x_out norm_layer_module(x) outs.append(x_out) if self.fork_feat: # outs should be a list of 4 feature maps return outs # This is a list return x # This is the final feature map for classification path def forward(self, x): x self.patch_embed(x) x_features self.forward_tokens(x) # This returns a list if fork_feat, else a tensor if self.fork_feat: # output features of four stages for dense prediction return x_features # x_features is already a list of tensors # Classification path (fork_feat is False) # Here x_features is the single tensor output from forward_tokens x_final self.norm(x_features) if hasattr(self, dist_head) and self.distillation: # Check if dist_head exists # When distillationTrue, head and dist_head are present cls_out_main self.head(x_final.flatten(2).mean(-1)) cls_out_dist self.dist_head(x_final.flatten(2).mean(-1)) if not self.training: # Average outputs during inference cls_out (cls_out_main cls_out_dist) / 2 else: # Return both during training cls_out cls_out_main, cls_out_dist # This is a TUPLE else: cls_out self.head(x_final.flatten(2).mean(-1)) # This is a TENSOR # To consistently 
return a list from forward to potentially aid ultralytics, # even in classification mode, we could wrap cls_out. # However, standard classification models return a tensor/tuple. # The key is that when used as a backbone (fork_featTrue), it returns a list. return cls_out # # Helper function for loading weights, similar to SwinTransformers def update_weight(model_dict, weight_dict): idx, temp_dict 0, {} for k, v in weight_dict.items(): if k in model_dict.keys() and np.shape(model_dict[k]) np.shape(v): temp_dict[k] v idx 1 model_dict.update(temp_dict) print(fLoading weights... {idx}/{len(model_dict)} items loaded successfully.) return model_dict # New factory functions, similar to SwinTransformer register_model def RCViT_XS(weights, pretrained_strictFalse, **kwargs): # Added pretrained_strict model RCViT( layers[2, 2, 4, 2], embed_dims[48, 56, 112, 220], mlp_ratios4, downsamples[True, True, True, True], norm_layernn.BatchNorm2d, attn_biasFalse, act_layernn.GELU, fork_featTrue, # Default to True for backbone usage **kwargs) if weights: # Using the simpler load_pretrained method inside RCViT model.load_pretrained(weights) # Or, if you prefer the Swin-style external loading: # state_dict torch.load(weights)[model] # Adjust key if necessary # model.load_state_dict(update_weight(model.state_dict(), state_dict), strictpretrained_strict) return model register_model def RCViT_S(weights, pretrained_strictFalse, **kwargs): model RCViT( layers[3, 3, 6, 3], embed_dims[48, 64, 128, 256], mlp_ratios4, downsamples[True, True, True, True], norm_layernn.BatchNorm2d, attn_biasFalse, act_layernn.GELU, fork_featTrue, **kwargs) if weights: model.load_pretrained(weights) return model register_model def RCViT_M(weights, pretrained_strictFalse, **kwargs): model RCViT( layers[3, 3, 6, 3], embed_dims[64, 96, 192, 384], mlp_ratios4, downsamples[True, True, True, True], norm_layernn.BatchNorm2d, attn_biasFalse, act_layernn.GELU, fork_featTrue, **kwargs) if weights: model.load_pretrained(weights) 
return model register_model def RCViT_T(weights, pretrained_strictFalse, **kwargs): # Assuming T means Tiny or a different variant model RCViT( layers[3, 3, 6, 3], embed_dims[96, 128, 256, 512], mlp_ratios4, downsamples[True, True, True, True], norm_layernn.BatchNorm2d, attn_biasFalse, act_layernn.GELU, fork_featTrue, **kwargs) if weights: model.load_pretrained(weights) return model # def count_parameters(model): return sum(p.numel() for p in model.parameters() if p.requires_grad)2.2 更改init.py文件关键步骤二在文件ultralytics\ultralytics\nn\modules\models文件夹下新建__init__.py文件先导入函数然后在下面的__all__中声明函数2.3 添加yaml文件关键步骤三在/ultralytics/ultralytics/cfg/models/26下面新建文件yolo26_CAS_ViT.yaml文件粘贴下面的内容目标检测# Ultralytics AGPL-3.0 License - https://ultralytics.com/license # Ultralytics YOLO26 object detection model with P3/8 - P5/32 outputs # Model docs: https://docs.ultralytics.com/models/yolo26 # Task docs: https://docs.ultralytics.com/tasks/detect # Parameters nc: 80 # number of classes end2end: True # whether to use end-to-end mode reg_max: 1 # DFL bins scales: # model compound scaling constants, i.e. 
modelyolo26n.yaml will call yolo26.yaml with scale n # [depth, width, max_channels] n: [0.50, 0.25, 1024] # summary: 260 layers, 2,572,280 parameters, 2,572,280 gradients, 6.1 GFLOPs s: [0.50, 0.50, 1024] # summary: 260 layers, 10,009,784 parameters, 10,009,784 gradients, 22.8 GFLOPs m: [0.50, 1.00, 512] # summary: 280 layers, 21,896,248 parameters, 21,896,248 gradients, 75.4 GFLOPs l: [1.00, 1.00, 512] # summary: 392 layers, 26,299,704 parameters, 26,299,704 gradients, 93.8 GFLOPs x: [1.00, 1.50, 512] # summary: 392 layers, 58,993,368 parameters, 58,993,368 gradients, 209.5 GFLOPs # YOLO26n backbone backbone: # [from, repeats, module, args] - [-1, 1, RCViT_XS, []] - [-1, 1, SPPF, [1024, 5]] # 5 - [-1, 2, C2PSA, [1024]] # 6 # YOLO26n head head: - [-1, 1, nn.Upsample, [None, 2, nearest]] - [[-1, 3], 1, Concat, [1]] # cat backbone P4 - [-1, 2, C3k2, [512, False]] # 9 - [-1, 1, nn.Upsample, [None, 2, nearest]] - [[-1, 2], 1, Concat, [1]] # cat backbone P3 - [-1, 2, C3k2, [256, False]] # 12 (P3/8-small) - [-1, 1, Conv, [256, 3, 2]] - [[-1, 9], 1, Concat, [1]] # cat head P4 - [-1, 2, C3k2, [512, False]] # 15 (P4/16-medium) - [-1, 1, Conv, [512, 3, 2]] - [[-1, 6], 1, Concat, [1]] # cat head P5 - [-1, 2, C3k2, [1024, True]] # 18 (P5/32-large) - [[12, 15, 18], 1, Detect, [nc]] # Detect(P3, P4, P5)语义分割# Ultralytics AGPL-3.0 License - https://ultralytics.com/license # Ultralytics YOLO26 object detection model with P3/8 - P5/32 outputs # Model docs: https://docs.ultralytics.com/models/yolo26 # Task docs: https://docs.ultralytics.com/tasks/detect # Parameters nc: 80 # number of classes end2end: True # whether to use end-to-end mode reg_max: 1 # DFL bins scales: # model compound scaling constants, i.e. 
modelyolo26n.yaml will call yolo26.yaml with scale n # [depth, width, max_channels] n: [0.50, 0.25, 1024] # summary: 260 layers, 2,572,280 parameters, 2,572,280 gradients, 6.1 GFLOPs s: [0.50, 0.50, 1024] # summary: 260 layers, 10,009,784 parameters, 10,009,784 gradients, 22.8 GFLOPs m: [0.50, 1.00, 512] # summary: 280 layers, 21,896,248 parameters, 21,896,248 gradients, 75.4 GFLOPs l: [1.00, 1.00, 512] # summary: 392 layers, 26,299,704 parameters, 26,299,704 gradients, 93.8 GFLOPs x: [1.00, 1.50, 512] # summary: 392 layers, 58,993,368 parameters, 58,993,368 gradients, 209.5 GFLOPs # YOLO26n backbone backbone: # [from, repeats, module, args] - [-1, 1, RCViT_XS, []] - [-1, 1, SPPF, [1024, 5]] # 5 - [-1, 2, C2PSA, [1024]] # 6 # YOLO26n head head: - [-1, 1, nn.Upsample, [None, 2, nearest]] - [[-1, 3], 1, Concat, [1]] # cat backbone P4 - [-1, 2, C3k2, [512, False]] # 9 - [-1, 1, nn.Upsample, [None, 2, nearest]] - [[-1, 2], 1, Concat, [1]] # cat backbone P3 - [-1, 2, C3k2, [256, False]] # 12 (P3/8-small) - [-1, 1, Conv, [256, 3, 2]] - [[-1, 9], 1, Concat, [1]] # cat head P4 - [-1, 2, C3k2, [512, False]] # 15 (P4/16-medium) - [-1, 1, Conv, [512, 3, 2]] - [[-1, 6], 1, Concat, [1]] # cat head P5 - [-1, 2, C3k2, [1024, True]] # 18 (P5/32-large) - [[12, 15, 18], 1, Segment, [nc, 32, 256]]旋转目标检测# Ultralytics AGPL-3.0 License - https://ultralytics.com/license # Ultralytics YOLO26 object detection model with P3/8 - P5/32 outputs # Model docs: https://docs.ultralytics.com/models/yolo26 # Task docs: https://docs.ultralytics.com/tasks/detect # Parameters nc: 80 # number of classes end2end: True # whether to use end-to-end mode reg_max: 1 # DFL bins scales: # model compound scaling constants, i.e. 
modelyolo26n.yaml will call yolo26.yaml with scale n # [depth, width, max_channels] n: [0.50, 0.25, 1024] # summary: 260 layers, 2,572,280 parameters, 2,572,280 gradients, 6.1 GFLOPs s: [0.50, 0.50, 1024] # summary: 260 layers, 10,009,784 parameters, 10,009,784 gradients, 22.8 GFLOPs m: [0.50, 1.00, 512] # summary: 280 layers, 21,896,248 parameters, 21,896,248 gradients, 75.4 GFLOPs l: [1.00, 1.00, 512] # summary: 392 layers, 26,299,704 parameters, 26,299,704 gradients, 93.8 GFLOPs x: [1.00, 1.50, 512] # summary: 392 layers, 58,993,368 parameters, 58,993,368 gradients, 209.5 GFLOPs # YOLO26n backbone backbone: # [from, repeats, module, args] - [-1, 1, RCViT_XS, []] - [-1, 1, SPPF, [1024, 5]] # 5 - [-1, 2, C2PSA, [1024]] # 6 # YOLO26n head head: - [-1, 1, nn.Upsample, [None, 2, nearest]] - [[-1, 3], 1, Concat, [1]] # cat backbone P4 - [-1, 2, C3k2, [512, False]] # 9 - [-1, 1, nn.Upsample, [None, 2, nearest]] - [[-1, 2], 1, Concat, [1]] # cat backbone P3 - [-1, 2, C3k2, [256, False]] # 12 (P3/8-small) - [-1, 1, Conv, [256, 3, 2]] - [[-1, 9], 1, Concat, [1]] # cat head P4 - [-1, 2, C3k2, [512, False]] # 15 (P4/16-medium) - [-1, 1, Conv, [512, 3, 2]] - [[-1, 6], 1, Concat, [1]] # cat head P5 - [-1, 2, C3k2, [1024, True]] # 18 (P5/32-large) - [[12, 15, 18], 1, OBB, [nc, 1]]温馨提示本文只是对yolo26基础上添加模块如果要对yolo26 n/l/m/x进行添加则只需要指定对应的depth_multiple 和 width_multipleend2end: True # whether to use end-to-end mode reg_max: 1 # DFL bins scales: # model compound scaling constants, i.e. 
model=yolo26n.yaml will call yolo26.yaml with scale n # [depth, width, max_channels] n: [0.50, 0.25, 1024] # summary: 260 layers, 2,572,280 parameters, 2,572,280 gradients, 6.1 GFLOPs s: [0.50, 0.50, 1024] # summary: 260 layers, 10,009,784 parameters, 10,009,784 gradients, 22.8 GFLOPs m: [0.50, 1.00, 512] # summary: 280 layers, 21,896,248 parameters, 21,896,248 gradients, 75.4 GFLOPs l: [1.00, 1.00, 512] # summary: 392 layers, 26,299,704 parameters, 26,299,704 gradients, 93.8 GFLOPs x: [1.00, 1.50, 512] # summary: 392 layers, 58,993,368 parameters, 58,993,368 gradients, 209.5 GFLOPs2.4 在task.py中进行注册关键步骤四在parse_model函数中进行注册添加CAS_ViT先在task.py导入函数然后在task.py文件下找到parse_model这个函数如下图添加CAS_ViTelif m in {RCViT_XS, RCViT_S, RCViT_M, RCViT_T}: m = m(*args) c2 = m.width_list backbone = True else: c2 = ch[f]2.5 执行程序关键步骤五:在ultralytics文件中新建train.py将model的参数路径设置为yolo26_CAS_ViT.yaml的路径即可 【注意是在外边的Ultralytics下新建train.py】from ultralytics import YOLO import warnings warnings.filterwarnings('ignore') from pathlib import Path if __name__ == '__main__': # 加载模型 model = YOLO('ultralytics/cfg/models/26/yolo26_CAS_ViT.yaml') # 你要选择的模型yaml文件地址 # Use the model results = model.train(data=r'你的数据集的yaml文件地址', epochs=100, batch=16, imgsz=640, workers=4, name=Path(model.cfg).stem) # 训练模型运行程序如果出现下面的内容则说明添加成功from n params module arguments 0 -1 1 2759132 RCViT_XS [] 1 -1 1 137572 ultralytics.nn.modules.block.SPPF [220, 256, 5] 2 -1 1 249728 ultralytics.nn.modules.block.C2PSA [256, 256, 1] 3 -1 1 0 torch.nn.modules.upsampling.Upsample [None, 2, nearest] 4 [-1, 3] 1 0 ultralytics.nn.modules.conv.Concat [1] 5 -1 1 109248 ultralytics.nn.modules.block.C3k2 [368, 128, 1, False] 6 -1 1 0 torch.nn.modules.upsampling.Upsample [None, 2, nearest] 7 [-1, 2] 1 0 ultralytics.nn.modules.conv.Concat [1] 8 -1 1 27488 ultralytics.nn.modules.block.C3k2 [184, 64, 1, False] 9 -1 1 36992 ultralytics.nn.modules.conv.Conv [64, 64, 3, 2] 10 [-1, 9] 1 0 ultralytics.nn.modules.conv.Concat [1] 11 -1 1 86720 ultralytics.nn.modules.block.C3k2 [192, 128, 1, False] 12 -1 1 
147712 ultralytics.nn.modules.conv.Conv [128, 128, 3, 2] 13 [-1, 6] 1 0 ultralytics.nn.modules.conv.Concat [1] 14 -1 1 378880 ultralytics.nn.modules.block.C3k2 [384, 256, 1, True] 15 [12, 15, 18] 1 309656 ultralytics.nn.modules.head.Detect [80, 1, True, [64, 128, 256]] YOLO26_CAS_ViT summary: 500 layers, 4,243,128 parameters, 4,243,128 gradients, 12.2 GFLOPs3. 完整代码分享主页侧边4. GFLOPs关于GFLOPs的计算方式可以查看百面算法工程师 | 卷积基础知识——Convolution未改进的YOLO26n GFLOPs改进后的GFLOPs5. 进阶可以与其他的注意力机制或者损失函数等结合进一步提升检测效果6.总结通过以上的改进方法我们成功提升了模型的表现。这只是一个开始未来还有更多优化和技术深挖的空间。在这里我想隆重向大家推荐我的专栏——专栏地址YOLO26改进-论文涨点——点击跳转看所有内容关注不迷路。这个专栏专注于前沿的深度学习技术特别是目标检测领域的最新进展不仅包含对YOLO26的深入解析和改进策略还会定期更新来自各大顶会如CVPR、NeurIPS等的论文复现和实战分享。为什么订阅我的专栏——专栏地址YOLO26改进-论文涨点——点击跳转看所有内容关注不迷路前沿技术解读专栏不仅限于YOLO系列的改进还会涵盖各类主流与新兴网络的最新研究成果帮助你紧跟技术潮流。详尽的实践分享所有内容实践性也极强。每次更新都会附带代码和具体的改进步骤保证每位读者都能迅速上手。问题互动与答疑订阅我的专栏后你将可以随时向我提问获取及时的答疑。实时更新紧跟行业动态不定期发布来自全球顶会的最新研究方向和复现实验报告让你时刻走在技术前沿。专栏适合人群对目标检测、YOLO系列网络有深厚兴趣的同学希望在用YOLO算法写论文的同学对YOLO算法感兴趣的同学等