sam2.1_hiera_b+.yaml

# @package _global_

# Model
model:
  _target_: sam2.modeling.sam2_base.SAM2Base
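  # Image encoder: Hiera trunk plus FPN neck; the neck projects the
  # multi-scale trunk features down to the 256-d features used downstream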
  image_encoder:
    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
    scalp: 1
    trunk:
      _target_: sam2.modeling.backbones.hieradet.Hiera
      embed_dim: 112
      num_heads: 2
    neck:
      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
        num_pos_feats: 256
        normalize: true
        scale: null
        temperature: 10000
      d_model: 256
      backbone_channel_list: [896, 448, 224, 112]
      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
      fpn_interp_model: nearest
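
  # Memory attention: conditions the current frame's features on the memory
  # bank with stacked self-attention + cross-attention layers (RoPE)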
  memory_attention:
    _target_: sam2.modeling.memory_attention.MemoryAttention
    d_model: 256
    pos_enc_at_input: true
    layer:
      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
      activation: relu
      dim_feedforward: 2048
      dropout: 0.1
      pos_enc_at_attn: false
      self_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [64, 64]
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
      d_model: 256
      pos_enc_at_cross_attn_keys: true
      pos_enc_at_cross_attn_queries: false
      cross_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [64, 64]
        rope_k_repeat: True
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
        kv_in_dim: 64
    num_layers: 4
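
  # Memory encoder: downsamples predicted masks, fuses them with image
  # features (CXBlock fuser), and projects to 64-d memory features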
  memory_encoder:
    _target_: sam2.modeling.memory_encoder.MemoryEncoder
    out_dim: 64
    position_encoding:
      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
      num_pos_feats: 64
      normalize: true
      scale: null
      temperature: 10000
    mask_downsampler:
      _target_: sam2.modeling.memory_encoder.MaskDownSampler
      kernel_size: 3
      stride: 2
      padding: 1
    fuser:
      _target_: sam2.modeling.memory_encoder.Fuser
      layer:
        _target_: sam2.modeling.memory_encoder.CXBlock
        dim: 256
        kernel_size: 7
        padding: 3
        layer_scale_init_value: 1e-6
        use_dwconv: True  # depth-wise convs
      num_layers: 2
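
  # Memory bank size (number of mask-memory frames) and input resolution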
  num_maskmem: 7
  image_size: 1024
  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
  sigmoid_scale_for_mem_enc: 20.0
  sigmoid_bias_for_mem_enc: -10.0
  use_mask_input_as_output_without_sam: true
  # Memory
  directly_add_no_mem_embed: true
  no_obj_embed_spatial: true
  # use high-resolution feature map in the SAM mask decoder
  use_high_res_features_in_sam: true
  # output 3 masks on the first click on initial conditioning frames
  multimask_output_in_sam: true
  # SAM heads
  iou_prediction_use_sigmoid: True
  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
  use_obj_ptrs_in_encoder: true
  add_tpos_enc_to_obj_ptrs: true
  proj_tpos_enc_in_obj_ptrs: true
  use_signed_tpos_enc_to_obj_ptrs: true
  only_obj_ptrs_in_the_past_for_eval: true
  # object occlusion prediction
  pred_obj_scores: true
  pred_obj_scores_mlp: true
  fixed_no_obj_ptr: true
  # multimask tracking settings
  multimask_output_for_tracking: true
  use_multimask_token_for_obj_ptr: true
  multimask_min_pt_num: 0
  multimask_max_pt_num: 1
  use_mlp_for_obj_ptr_proj: true
  # Compilation flag
  compile_image_encoder: False
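
# ---------------------------------------------------------------------------
# Usage sketch (kept as comments so this file remains valid YAML): this
# config is consumed through Hydra via build_sam2, which instantiates the
# _target_ classes above and loads the matching checkpoint. The paths and
# the placeholder image below are illustrative assumptions, not part of
# the config itself.
#
#   import numpy as np
#   import torch
#   from sam2.build_sam import build_sam2
#   from sam2.sam2_image_predictor import SAM2ImagePredictor
#
#   model_cfg = "configs/sam2.1/sam2.1_hiera_b+.yaml"       # this file
#   checkpoint = "./checkpoints/sam2.1_hiera_base_plus.pt"  # assumed local path
#   predictor = SAM2ImagePredictor(build_sam2(model_cfg, checkpoint))
#
#   image = np.zeros((1024, 1024, 3), dtype=np.uint8)  # placeholder HxWx3 image
#   with torch.inference_mode():
#       predictor.set_image(image)
#       # one positive click at the image center; multimask output returns
#       # up to 3 candidate masks, matching multimask_output_in_sam above
#       masks, scores, _ = predictor.predict(
#           point_coords=np.array([[512, 512]]),
#           point_labels=np.array([1]),
#           multimask_output=True,
#       )
# ---------------------------------------------------------------------------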