# @package _global_
# NOTE: keep the directive above in the leading comment header of the file;
# Hydra reads `@package` from there. Every `_target_` below is a dotted import
# path; the sibling keys at the same level are presumably passed to that
# class as constructor arguments (Hydra `instantiate` convention).

# Model
model:
  _target_: sam2.modeling.sam2_base.SAM2Base

  # Image encoder: a Hiera trunk followed by an FPN neck.
  image_encoder:
    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
    scalp: 1
    trunk:
      _target_: sam2.modeling.backbones.hieradet.Hiera
      embed_dim: 144
      num_heads: 2
      # 2 + 6 + 36 + 4 = 48 blocks in total; the indices in
      # global_att_blocks all fall inside that range.
      stages: [2, 6, 36, 4]
      global_att_blocks: [23, 33, 43]
      window_pos_embed_bkg_spatial_size: [7, 7]
      window_spec: [8, 4, 16, 8]
    neck:
      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
        num_pos_feats: 256
        normalize: true
        scale: null
        temperature: 10000
      d_model: 256
      # 144 x {8, 4, 2, 1}: multiples of trunk.embed_dim. Presumably the
      # per-stage channel widths, deepest stage first; confirm against Hiera
      # before changing embed_dim.
      backbone_channel_list: [1152, 576, 288, 144]
      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
      fpn_interp_model: nearest

  # Memory attention: a stack of `num_layers` identical layers, each with a
  # RoPE self-attention and a RoPE cross-attention sub-module.
  memory_attention:
    _target_: sam2.modeling.memory_attention.MemoryAttention
    d_model: 256
    pos_enc_at_input: true
    layer:
      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
      activation: relu
      dim_feedforward: 2048
      dropout: 0.1
      pos_enc_at_attn: false
      self_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [64, 64]
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
      d_model: 256
      pos_enc_at_cross_attn_keys: true
      pos_enc_at_cross_attn_queries: false
      cross_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [64, 64]
        rope_k_repeat: true
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
        # Same value as memory_encoder.out_dim below; presumably the two
        # must be kept in sync (verify against RoPEAttention).
        kv_in_dim: 64
    num_layers: 4

  # Memory encoder: mask downsampler + fuser, with its own sine position
  # encoding.
  memory_encoder:
    _target_: sam2.modeling.memory_encoder.MemoryEncoder
    out_dim: 64
    position_encoding:
      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
      num_pos_feats: 64
      normalize: true
      scale: null
      temperature: 10000
    mask_downsampler:
      _target_: sam2.modeling.memory_encoder.MaskDownSampler
      kernel_size: 3
      stride: 2
      padding: 1
    fuser:
      _target_: sam2.modeling.memory_encoder.Fuser
      layer:
        _target_: sam2.modeling.memory_encoder.CXBlock
        dim: 256
        kernel_size: 7
        padding: 3
        # Written with an explicit mantissa dot so YAML 1.1 loaders (e.g.
        # stock PyYAML) resolve it as a float rather than the string "1e-6".
        layer_scale_init_value: 1.0e-6
        use_dwconv: true  # depth-wise convs
      num_layers: 2

  num_maskmem: 7
  image_size: 1024
  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
  sigmoid_scale_for_mem_enc: 20.0
  sigmoid_bias_for_mem_enc: -10.0
  use_mask_input_as_output_without_sam: true
  # Memory
  directly_add_no_mem_embed: true
  no_obj_embed_spatial: true
  # use high-resolution feature map in the SAM mask decoder
  use_high_res_features_in_sam: true
  # output 3 masks on the first click on initial conditioning frames
  multimask_output_in_sam: true
  # SAM heads
  iou_prediction_use_sigmoid: true
  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
  use_obj_ptrs_in_encoder: true
  add_tpos_enc_to_obj_ptrs: true
  proj_tpos_enc_in_obj_ptrs: true
  use_signed_tpos_enc_to_obj_ptrs: true
  only_obj_ptrs_in_the_past_for_eval: true
  # object occlusion prediction
  pred_obj_scores: true
  pred_obj_scores_mlp: true
  fixed_no_obj_ptr: true
  # multimask tracking settings
  multimask_output_for_tracking: true
  use_multimask_token_for_obj_ptr: true
  multimask_min_pt_num: 0
  multimask_max_pt_num: 1
  use_mlp_for_obj_ptr_proj: true
  # Compilation flag
  compile_image_encoder: false