
open `README.md` with unicode (to support Hugging Face emoji); fix various typos (#218)

(close #217, #66, #67, #69, #91, #126, #127, #145)
Ronghang Hu 1 year ago
parent
commit
7e1596c0b6

+ 1 - 1
sam2/modeling/position_encoding.py

@@ -16,7 +16,7 @@ from torch import nn
 class PositionEmbeddingSine(nn.Module):
     """
     This is a more standard version of the position embedding, very similar to the one
-    used by the Attention is all you need paper, generalized to work on images.
+    used by the Attention Is All You Need paper, generalized to work on images.
     """
 
     def __init__(

+ 1 - 1
sam2/modeling/sam2_base.py

@@ -642,7 +642,7 @@ class SAM2Base(torch.nn.Module):
                 pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W)
                 return pix_feat_with_mem
 
-            # Use a dummy token on the first frame (to avoid emtpy memory input to tranformer encoder)
+            # Use a dummy token on the first frame (to avoid empty memory input to tranformer encoder)
             to_cat_memory = [self.no_mem_embed.expand(1, B, self.mem_dim)]
             to_cat_memory_pos_embed = [self.no_mem_pos_enc.expand(1, B, self.mem_dim)]
 

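The comment fixed above describes how SAM 2 avoids feeding an empty memory sequence to the memory-attention transformer on the first frame: a learned "no memory" embedding is broadcast across the batch and used as the memory input. A minimal standalone sketch of that broadcast, with illustrative `B` and `mem_dim` values and plain tensors standing in for the model's `self.no_mem_embed` / `self.no_mem_pos_enc` parameters:

```python
import torch
from torch import nn

B, mem_dim = 2, 64  # batch size and memory dimension, chosen for illustration

# Stand-ins for the model's learned `no_mem_embed` / `no_mem_pos_enc` parameters.
no_mem_embed = nn.Parameter(torch.zeros(1, 1, mem_dim))
no_mem_pos_enc = nn.Parameter(torch.zeros(1, 1, mem_dim))

# On the first frame there is no past memory, so a single dummy token is
# broadcast to (seq_len=1, batch, mem_dim) instead of passing an empty
# sequence into the memory-attention transformer.
to_cat_memory = [no_mem_embed.expand(1, B, mem_dim)]
to_cat_memory_pos_embed = [no_mem_pos_enc.expand(1, B, mem_dim)]

memory = torch.cat(to_cat_memory, dim=0)                # shape (1, B, mem_dim)
memory_pos = torch.cat(to_cat_memory_pos_embed, dim=0)  # shape (1, B, mem_dim)
print(memory.shape, memory_pos.shape)
```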
+ 1 - 1
sam2/sam2_image_predictor.py

@@ -183,7 +183,7 @@ class SAM2ImagePredictor:
         normalize_coords=True,
     ) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
         """This function is very similar to predict(...), however it is used for batched mode, when the model is expected to generate predictions on multiple images.
-        It returns a tupele of lists of masks, ious, and low_res_masks_logits.
+        It returns a tuple of lists of masks, ious, and low_res_masks_logits.
         """
         assert self._is_batch, "This function should only be used when in batched mode"
         if not self._is_image_set:

+ 3 - 3
sam2/sam2_video_predictor.py

@@ -44,7 +44,7 @@ class SAM2VideoPredictor(SAM2Base):
         offload_state_to_cpu=False,
         async_loading_frames=False,
     ):
-        """Initialize a inference state."""
+        """Initialize an inference state."""
         compute_device = self.device  # device of the model
         images, video_height, video_width = load_video_frames(
             video_path=video_path,
@@ -589,7 +589,7 @@ class SAM2VideoPredictor(SAM2Base):
         # to `propagate_in_video_preflight`).
         consolidated_frame_inds = inference_state["consolidated_frame_inds"]
         for is_cond in [False, True]:
-            # Separately consolidate conditioning and non-conditioning temp outptus
+            # Separately consolidate conditioning and non-conditioning temp outputs
             storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
             # Find all the frames that contain temporary outputs for any objects
             # (these should be the frames that have just received clicks for mask inputs
@@ -598,7 +598,7 @@ class SAM2VideoPredictor(SAM2Base):
             for obj_temp_output_dict in temp_output_dict_per_obj.values():
                 temp_frame_inds.update(obj_temp_output_dict[storage_key].keys())
             consolidated_frame_inds[storage_key].update(temp_frame_inds)
-            # consolidate the temprary output across all objects on this frame
+            # consolidate the temporary output across all objects on this frame
             for frame_idx in temp_frame_inds:
                 consolidated_out = self._consolidate_temp_output_across_obj(
                     inference_state, frame_idx, is_cond=is_cond, run_mem_encoder=True

+ 2 - 2
sam2/utils/misc.py

@@ -68,7 +68,7 @@ def mask_to_box(masks: torch.Tensor):
     compute bounding box given an input mask
 
     Inputs:
-    - masks: [B, 1, H, W] boxes, dtype=torch.Tensor
+    - masks: [B, 1, H, W] masks, dtype=torch.Tensor
 
     Returns:
     - box_coords: [B, 1, 4], contains (x, y) coordinates of top left and bottom right box corners, dtype=torch.Tensor
@@ -120,7 +120,7 @@ class AsyncVideoFrameLoader:
         self.offload_video_to_cpu = offload_video_to_cpu
         self.img_mean = img_mean
         self.img_std = img_std
-        # items in `self._images` will be loaded asynchronously
+        # items in `self.images` will be loaded asynchronously
         self.images = [None] * len(img_paths)
         # catch and raise any exceptions in the async loading thread
         self.exception = None
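The `mask_to_box` hunk above corrects the docstring to say masks rather than boxes: the helper takes [B, 1, H, W] masks and returns [B, 1, 4] corner boxes. For readers unfamiliar with it, here is a minimal sketch of how such a reduction can be written — an illustration under that interface assumption, not the repository's actual implementation (the all-empty-mask case is ignored):

```python
import torch

def mask_to_box_sketch(masks: torch.Tensor) -> torch.Tensor:
    """Illustrative only: [B, 1, H, W] boolean masks -> [B, 1, 4] (x0, y0, x1, y1) boxes."""
    masks = masks.bool()
    B, _, H, W = masks.shape
    xs = torch.arange(W, device=masks.device).view(1, 1, 1, W).expand(B, 1, H, W)
    ys = torch.arange(H, device=masks.device).view(1, 1, H, 1).expand(B, 1, H, W)
    # Fill background pixels with out-of-range sentinels so min/max ignore them.
    x_min = xs.masked_fill(~masks, W).flatten(2).min(dim=-1).values
    y_min = ys.masked_fill(~masks, H).flatten(2).min(dim=-1).values
    x_max = xs.masked_fill(~masks, -1).flatten(2).max(dim=-1).values
    y_max = ys.masked_fill(~masks, -1).flatten(2).max(dim=-1).values
    return torch.stack([x_min, y_min, x_max, y_max], dim=-1)  # [B, 1, 4]

m = torch.zeros(1, 1, 8, 8, dtype=torch.bool)
m[0, 0, 2:5, 3:7] = True
print(mask_to_box_sketch(m))  # tensor([[[3, 2, 6, 4]]])
```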

+ 1 - 1
sav_dataset/sav_evaluator.py

@@ -72,7 +72,7 @@ parser.add_argument(
 parser.add_argument(
     "--do_not_skip_first_and_last_frame",
     help="In SA-V val and test, we skip the first and the last annotated frames in evaluation. "
-    "Set this to true for evaluation on settings that doen't skip first and last frames",
+    "Set this to true for evaluation on settings that doesn't skip first and last frames",
     action="store_true",
 )
 

+ 1 - 1
sav_dataset/utils/sav_benchmark.py

@@ -183,7 +183,7 @@ def _seg2bmap(seg, width=None, height=None):
 
     assert not (
         width > w | height > h | abs(ar1 - ar2) > 0.01
-    ), "Can" "t convert %dx%d seg to %dx%d bmap." % (w, h, width, height)
+    ), "Cannot convert %dx%d seg to %dx%d bmap." % (w, h, width, height)
 
     e = np.zeros_like(seg)
     s = np.zeros_like(seg)
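One detail worth spelling out in this hunk: Python joins adjacent string literals at compile time, so the old `"Can" "t convert ..."` assertion message rendered as "Cant convert ..."; the patch replaces it with a single "Cannot ..." literal. A quick check:

```python
# Adjacent string literals are concatenated at compile time, which is why the
# old assertion message read "Cant convert ...".
old_msg = "Can" "t convert %dx%d seg to %dx%d bmap." % (4, 4, 8, 8)
new_msg = "Cannot convert %dx%d seg to %dx%d bmap." % (4, 4, 8, 8)
assert old_msg == "Cant convert 4x4 seg to 8x8 bmap."
assert new_msg == "Cannot convert 4x4 seg to 8x8 bmap."
```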

+ 1 - 1
setup.py

@@ -17,7 +17,7 @@ AUTHOR_EMAIL = "segment-anything@meta.com"
 LICENSE = "Apache 2.0"
 
 # Read the contents of README file
-with open("README.md", "r") as f:
+with open("README.md", "r", encoding="utf-8") as f:
     LONG_DESCRIPTION = f.read()
 
 # Required dependencies
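This setup.py change is the commit's headline fix: without an explicit encoding, `open()` falls back to `locale.getpreferredencoding()`, which is not UTF-8 on some systems (for example certain Windows locales), so reading a README.md containing the Hugging Face 🤗 emoji could raise `UnicodeDecodeError` or produce mojibake. A small sketch of the behavior, using a hypothetical demo file rather than the real README.md:

```python
from pathlib import Path

demo = Path("README_demo.md")  # hypothetical file, for illustration only
demo.write_text("SAM 2 on Hugging Face 🤗", encoding="utf-8")

# Explicit UTF-8, as in the patched setup.py, reads the emoji back correctly.
with open(demo, "r", encoding="utf-8") as f:
    assert "🤗" in f.read()

# Without `encoding=...`, the result depends on the platform's default encoding.
try:
    with open(demo, "r") as f:
        f.read()
except UnicodeDecodeError:
    # Raised on systems whose locale encoding cannot decode the emoji's UTF-8 bytes.
    pass
```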