sam2_utils.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235
  1. # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
  2. # All rights reserved.
  3. # pyre-unsafe
  4. # This source code is licensed under the license found in the
  5. # LICENSE file in the root directory of this source tree.
  6. import os
  7. from threading import Thread
  8. import numpy as np
  9. import torch
  10. from PIL import Image
  11. from tqdm import tqdm
  12. def _load_img_as_tensor(img_path, image_size):
  13. img_pil = Image.open(img_path)
  14. img_np = np.array(img_pil.convert("RGB").resize((image_size, image_size)))
  15. if img_np.dtype == np.uint8: # np.uint8 is expected for JPEG images
  16. img_np = img_np / 255.0
  17. else:
  18. raise RuntimeError(f"Unknown image dtype: {img_np.dtype} on {img_path}")
  19. img = torch.from_numpy(img_np).permute(2, 0, 1)
  20. video_width, video_height = img_pil.size # the original video size
  21. return img, video_height, video_width
  22. class AsyncVideoFrameLoader:
  23. """
  24. A list of video frames to be load asynchronously without blocking session start.
  25. """
  26. def __init__(
  27. self,
  28. img_paths,
  29. image_size,
  30. offload_video_to_cpu,
  31. img_mean,
  32. img_std,
  33. compute_device,
  34. ):
  35. self.img_paths = img_paths
  36. self.image_size = image_size
  37. self.offload_video_to_cpu = offload_video_to_cpu
  38. self.img_mean = img_mean
  39. self.img_std = img_std
  40. # items in `self.images` will be loaded asynchronously
  41. self.images = [None] * len(img_paths)
  42. # catch and raise any exceptions in the async loading thread
  43. self.exception = None
  44. # video_height and video_width be filled when loading the first image
  45. self.video_height = None
  46. self.video_width = None
  47. self.compute_device = compute_device
  48. # load the first frame to fill video_height and video_width and also
  49. # to cache it (since it's most likely where the user will click)
  50. self.__getitem__(0)
  51. # load the rest of frames asynchronously without blocking the session start
  52. def _load_frames():
  53. try:
  54. for n in tqdm(range(len(self.images)), desc="frame loading (JPEG)"):
  55. self.__getitem__(n)
  56. except Exception as e:
  57. self.exception = e
  58. self.thread = Thread(target=_load_frames, daemon=True)
  59. self.thread.start()
  60. def __getitem__(self, index):
  61. if self.exception is not None:
  62. raise RuntimeError("Failure in frame loading thread") from self.exception
  63. img = self.images[index]
  64. if img is not None:
  65. return img
  66. img, video_height, video_width = _load_img_as_tensor(
  67. self.img_paths[index], self.image_size
  68. )
  69. self.video_height = video_height
  70. self.video_width = video_width
  71. # normalize by mean and std
  72. img -= self.img_mean
  73. img /= self.img_std
  74. if not self.offload_video_to_cpu:
  75. img = img.to(self.compute_device, non_blocking=True)
  76. self.images[index] = img
  77. return img
  78. def __len__(self):
  79. return len(self.images)
  80. def load_video_frames(
  81. video_path,
  82. image_size,
  83. offload_video_to_cpu,
  84. img_mean=(0.5, 0.5, 0.5),
  85. img_std=(0.5, 0.5, 0.5),
  86. async_loading_frames=False,
  87. compute_device=torch.device("cuda"),
  88. ):
  89. """
  90. Load the video frames from video_path. The frames are resized to image_size as in
  91. the model and are loaded to GPU if offload_video_to_cpu=False. This is used by the demo.
  92. """
  93. is_bytes = isinstance(video_path, bytes)
  94. is_str = isinstance(video_path, str)
  95. is_mp4_path = is_str and os.path.splitext(video_path)[-1] in [".mp4", ".MP4"]
  96. if is_bytes or is_mp4_path:
  97. return load_video_frames_from_video_file(
  98. video_path=video_path,
  99. image_size=image_size,
  100. offload_video_to_cpu=offload_video_to_cpu,
  101. img_mean=img_mean,
  102. img_std=img_std,
  103. compute_device=compute_device,
  104. )
  105. elif is_str and os.path.isdir(video_path):
  106. return load_video_frames_from_jpg_images(
  107. video_path=video_path,
  108. image_size=image_size,
  109. offload_video_to_cpu=offload_video_to_cpu,
  110. img_mean=img_mean,
  111. img_std=img_std,
  112. async_loading_frames=async_loading_frames,
  113. compute_device=compute_device,
  114. )
  115. else:
  116. raise NotImplementedError(
  117. "Only MP4 video and JPEG folder are supported at this moment"
  118. )
  119. def load_video_frames_from_jpg_images(
  120. video_path,
  121. image_size,
  122. offload_video_to_cpu,
  123. img_mean=(0.5, 0.5, 0.5),
  124. img_std=(0.5, 0.5, 0.5),
  125. async_loading_frames=False,
  126. compute_device=torch.device("cuda"),
  127. ):
  128. """
  129. Load the video frames from a directory of JPEG files ("<frame_index>.jpg" format).
  130. The frames are resized to image_size x image_size and are loaded to GPU if
  131. `offload_video_to_cpu` is `False` and to CPU if `offload_video_to_cpu` is `True`.
  132. You can load a frame asynchronously by setting `async_loading_frames` to `True`.
  133. """
  134. if isinstance(video_path, str) and os.path.isdir(video_path):
  135. jpg_folder = video_path
  136. else:
  137. raise NotImplementedError(
  138. "Only JPEG frames are supported at this moment. For video files, you may use "
  139. "ffmpeg (https://ffmpeg.org/) to extract frames into a folder of JPEG files, such as \n"
  140. "```\n"
  141. "ffmpeg -i <your_video>.mp4 -q:v 2 -start_number 0 <output_dir>/'%05d.jpg'\n"
  142. "```\n"
  143. "where `-q:v` generates high-quality JPEG frames and `-start_number 0` asks "
  144. "ffmpeg to start the JPEG file from 00000.jpg."
  145. )
  146. frame_names = [
  147. p
  148. for p in os.listdir(jpg_folder)
  149. if os.path.splitext(p)[-1] in [".jpg", ".jpeg", ".JPG", ".JPEG"]
  150. ]
  151. frame_names.sort(key=lambda p: int(os.path.splitext(p)[0]))
  152. num_frames = len(frame_names)
  153. if num_frames == 0:
  154. raise RuntimeError(f"no images found in {jpg_folder}")
  155. img_paths = [os.path.join(jpg_folder, frame_name) for frame_name in frame_names]
  156. img_mean = torch.tensor(img_mean, dtype=torch.float32)[:, None, None]
  157. img_std = torch.tensor(img_std, dtype=torch.float32)[:, None, None]
  158. if async_loading_frames:
  159. lazy_images = AsyncVideoFrameLoader(
  160. img_paths,
  161. image_size,
  162. offload_video_to_cpu,
  163. img_mean,
  164. img_std,
  165. compute_device,
  166. )
  167. return lazy_images, lazy_images.video_height, lazy_images.video_width
  168. images = torch.zeros(num_frames, 3, image_size, image_size, dtype=torch.float32)
  169. for n, img_path in enumerate(tqdm(img_paths, desc="frame loading (JPEG)")):
  170. images[n], video_height, video_width = _load_img_as_tensor(img_path, image_size)
  171. if not offload_video_to_cpu:
  172. images = images.to(compute_device)
  173. img_mean = img_mean.to(compute_device)
  174. img_std = img_std.to(compute_device)
  175. # normalize by mean and std
  176. images -= img_mean
  177. images /= img_std
  178. return images, video_height, video_width
  179. def load_video_frames_from_video_file(
  180. video_path,
  181. image_size,
  182. offload_video_to_cpu,
  183. img_mean=(0.5, 0.5, 0.5),
  184. img_std=(0.5, 0.5, 0.5),
  185. compute_device=torch.device("cuda"),
  186. ):
  187. """Load the video frames from a video file."""
  188. import decord
  189. img_mean = torch.tensor(img_mean, dtype=torch.float32)[:, None, None]
  190. img_std = torch.tensor(img_std, dtype=torch.float32)[:, None, None]
  191. # Get the original video height and width
  192. decord.bridge.set_bridge("torch")
  193. video_height, video_width, _ = decord.VideoReader(video_path).next().shape
  194. # Iterate over all frames in the video
  195. images = []
  196. for frame in decord.VideoReader(video_path, width=image_size, height=image_size):
  197. images.append(frame.permute(2, 0, 1))
  198. images = torch.stack(images, dim=0).float() / 255.0
  199. if not offload_video_to_cpu:
  200. images = images.to(compute_device)
  201. img_mean = img_mean.to(compute_device)
  202. img_std = img_std.to(compute_device)
  203. # normalize by mean and std
  204. images -= img_mean
  205. images /= img_std
  206. return images, video_height, video_width