sam2_hiera_s.yaml

# @package _global_
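
# SAM 2 model config for the Hiera-S ("small") image encoder. Each `_target_`
# entry below names the class that Hydra instantiates for that component.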

# Model
model:
  _target_: sam2.modeling.sam2_base.SAM2Base
  image_encoder:
    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
    scalp: 1
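    # Hiera-S trunk: embed_dim 96 with stages [1, 2, 11, 2]; attention is
    # windowed everywhere except the three global-attention blocks below.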
    trunk:
      _target_: sam2.modeling.backbones.hieradet.Hiera
      embed_dim: 96
      num_heads: 1
      stages: [1, 2, 11, 2]
      global_att_blocks: [7, 10, 13]
      window_pos_embed_bkg_spatial_size: [7, 7]
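    # FPN neck: projects the four Hiera stage outputs (768/384/192/96 channels)
    # to 256-d maps with sine position encodings; top-down fusion is applied
    # only on the levels listed in fpn_top_down_levels.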
    neck:
      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
        num_pos_feats: 256
        normalize: true
        scale: null
        temperature: 10000
      d_model: 256
      backbone_channel_list: [768, 384, 192, 96]
      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
      fpn_interp_model: nearest
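
  # Memory attention: conditions current-frame features on the memory bank via
  # interleaved self-attention and cross-attention layers with rotary (RoPE)
  # position encodings.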
  memory_attention:
    _target_: sam2.modeling.memory_attention.MemoryAttention
    d_model: 256
    pos_enc_at_input: true
    layer:
      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
      activation: relu
      dim_feedforward: 2048
      dropout: 0.1
      pos_enc_at_attn: false
      self_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
      d_model: 256
      pos_enc_at_cross_attn_keys: true
      pos_enc_at_cross_attn_queries: false
      cross_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        rope_k_repeat: True
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
        kv_in_dim: 64
    num_layers: 4
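
  # Memory encoder: downsamples predicted masks to the feature-map resolution
  # and fuses them with image features into compact 64-d memory features.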
  memory_encoder:
    _target_: sam2.modeling.memory_encoder.MemoryEncoder
    out_dim: 64
    position_encoding:
      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
      num_pos_feats: 64
      normalize: true
      scale: null
      temperature: 10000
    mask_downsampler:
      _target_: sam2.modeling.memory_encoder.MaskDownSampler
      kernel_size: 3
      stride: 2
      padding: 1
    fuser:
      _target_: sam2.modeling.memory_encoder.Fuser
      layer:
        _target_: sam2.modeling.memory_encoder.CXBlock
        dim: 256
        kernel_size: 7
        padding: 3
        layer_scale_init_value: 1e-6
        use_dwconv: True  # depth-wise convs
      num_layers: 2
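
  # Memory bank size: 1 conditioning frame plus up to 6 recent frames.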
  num_maskmem: 7
  image_size: 1024
  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
  sigmoid_scale_for_mem_enc: 20.0
  sigmoid_bias_for_mem_enc: -10.0
  use_mask_input_as_output_without_sam: true
  # Memory
  directly_add_no_mem_embed: true
  # use high-resolution feature map in the SAM mask decoder
  use_high_res_features_in_sam: true
  # output 3 masks on the first click on initial conditioning frames
  multimask_output_in_sam: true
  # SAM heads
  iou_prediction_use_sigmoid: True
  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
  use_obj_ptrs_in_encoder: true
  add_tpos_enc_to_obj_ptrs: false
  only_obj_ptrs_in_the_past_for_eval: true
  # object occlusion prediction
  pred_obj_scores: true
  pred_obj_scores_mlp: true
  fixed_no_obj_ptr: true
  # multimask tracking settings
  multimask_output_for_tracking: true
  use_multimask_token_for_obj_ptr: true
  multimask_min_pt_num: 0
  multimask_max_pt_num: 1
  use_mlp_for_obj_ptr_proj: true
  # Compilation flag
  compile_image_encoder: False
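
# Usage note (not part of the config): a minimal sketch of loading this config
# through the sam2 package's build helper. It assumes the matching
# `sam2_hiera_small.pt` checkpoint has been downloaded locally.
#
#   import torch
#   from sam2.build_sam import build_sam2
#   from sam2.sam2_image_predictor import SAM2ImagePredictor
#
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   model = build_sam2("sam2_hiera_s.yaml", "sam2_hiera_small.pt", device=device)
#   predictor = SAM2ImagePredictor(model)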