sav_benchmark.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the sav_dataset directory of this source tree.

# adapted from https://github.com/hkchengrex/vos-benchmark
# and https://github.com/davisvideochallenge/davis2017-evaluation
# with their licenses found in the LICENSE_VOS_BENCHMARK and LICENSE_DAVIS files
# in the sav_dataset directory.
import math
import os
import time
from collections import defaultdict
from multiprocessing import Pool
from os import path
from typing import Dict, List, Tuple

import cv2
import numpy as np
import tqdm
from PIL import Image
from skimage.morphology import disk

class VideoEvaluator:
    def __init__(self, gt_root, pred_root, skip_first_and_last=True) -> None:
        """
        gt_root: path to the folder storing the gt masks
        pred_root: path to the folder storing the predicted masks
        skip_first_and_last: whether we should skip the evaluation of the first
            and the last frame. True for SA-V val and test, same as in DAVIS
            semi-supervised evaluation.
        """
        self.gt_root = gt_root
        self.pred_root = pred_root
        self.skip_first_and_last = skip_first_and_last

    def __call__(self, vid_name: str) -> Tuple[str, Dict[str, float], Dict[str, float]]:
        """
        vid_name: name of the video to evaluate
        """
        # scan the folder to find subfolders for evaluation and
        # check if the folder structure is SA-V
        to_evaluate, is_sav_format = self.scan_vid_folder(vid_name)

        # evaluate each (gt_path, pred_path) pair
        eval_results = []
        for all_frames, obj_id, gt_path, pred_path in to_evaluate:
            if self.skip_first_and_last:
                # skip the first and the last frames
                all_frames = all_frames[1:-1]

            evaluator = Evaluator(name=vid_name, obj_id=obj_id)
            for frame in all_frames:
                gt_array, pred_array = self.get_gt_and_pred(
                    gt_path, pred_path, frame, is_sav_format
                )
                evaluator.feed_frame(mask=pred_array, gt=gt_array)

            iou, boundary_f = evaluator.conclude()
            eval_results.append((obj_id, iou, boundary_f))

        if is_sav_format:
            iou_output, boundary_f_output = self.consolidate(eval_results)
        else:
            assert len(eval_results) == 1
            iou_output = eval_results[0][1]
            boundary_f_output = eval_results[0][2]

        return vid_name, iou_output, boundary_f_output
    def get_gt_and_pred(
        self,
        gt_path: str,
        pred_path: str,
        f_name: str,
        is_sav_format: bool,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Get the ground-truth and predicted masks for a single frame.
        """
        gt_mask_path = path.join(gt_path, f_name)
        pred_mask_path = path.join(pred_path, f_name)
        assert os.path.exists(pred_mask_path), f"{pred_mask_path} not found"

        gt_array = np.array(Image.open(gt_mask_path))
        pred_array = np.array(Image.open(pred_mask_path))
        assert (
            gt_array.shape[-2:] == pred_array.shape[-2:]
        ), f"shape mismatch: {gt_mask_path}, {pred_mask_path}"

        if is_sav_format:
            assert len(np.unique(gt_array)) <= 2, (
                f"found more than 1 object in {gt_mask_path} "
                "SA-V format assumes one object mask per png file."
            )
            assert len(np.unique(pred_array)) <= 2, (
                f"found more than 1 object in {pred_mask_path} "
                "SA-V format assumes one object mask per png file."
            )
            gt_array = gt_array > 0
            pred_array = pred_array > 0

        return gt_array, pred_array
    def scan_vid_folder(self, vid_name) -> Tuple[List, bool]:
        """
        Scan the folder structure of the video and return a list of folders to evaluate.
        """
        vid_gt_path = path.join(self.gt_root, vid_name)
        vid_pred_path = path.join(self.pred_root, vid_name)
        all_files_and_dirs = sorted(os.listdir(vid_gt_path))
        to_evaluate = []
        if all(name.endswith(".png") for name in all_files_and_dirs):
            # All files are png files, dataset structure similar to DAVIS
            is_sav_format = False
            frames = all_files_and_dirs
            obj_dir = None
            to_evaluate.append((frames, obj_dir, vid_gt_path, vid_pred_path))
        else:
            # SA-V dataset structure, going one layer down into each subdirectory
            is_sav_format = True
            for obj_dir in all_files_and_dirs:
                obj_gt_path = path.join(vid_gt_path, obj_dir)
                obj_pred_path = path.join(vid_pred_path, obj_dir)
                frames = sorted(os.listdir(obj_gt_path))
                to_evaluate.append((frames, obj_dir, obj_gt_path, obj_pred_path))
        return to_evaluate, is_sav_format
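
    # Illustrative folder layouts handled by scan_vid_folder (the names below
    # are hypothetical examples, not taken from the datasets themselves):
    #
    # DAVIS-style (only .png files, all objects encoded in one mask per frame):
    #   <gt_root>/<vid_name>/00000.png, 00001.png, ...
    #
    # SA-V-style (one subdirectory per object, one binary .png per frame):
    #   <gt_root>/<vid_name>/000/00000.png, 00001.png, ...
    #   <gt_root>/<vid_name>/001/00000.png, 00001.png, ...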

    def consolidate(
        self, eval_results
    ) -> Tuple[Dict[str, float], Dict[str, float]]:
        """
        Consolidate the results of all the objects from the video into one dictionary.
        """
        iou_output = {}
        boundary_f_output = {}
        for obj_id, iou, boundary_f in eval_results:
            assert len(iou) == 1
            key = list(iou.keys())[0]
            iou_output[obj_id] = iou[key]
            boundary_f_output[obj_id] = boundary_f[key]
        return iou_output, boundary_f_output
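
    # For a SA-V-format video with object folders "000" and "001", consolidate
    # returns one dict per metric keyed by those folder names, e.g. (values
    # hypothetical):
    #   iou_output        = {"000": 87.3, "001": 91.0}
    #   boundary_f_output = {"000": 84.1, "001": 89.5}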

#################################################################################################################
# Functions below are from https://github.com/hkchengrex/vos-benchmark with minor modifications
# _seg2bmap from https://github.com/hkchengrex/vos-benchmark/blob/main/vos_benchmark/utils.py
# get_iou and Evaluator from https://github.com/hkchengrex/vos-benchmark/blob/main/vos_benchmark/evaluator.py
# benchmark from https://github.com/hkchengrex/vos-benchmark/blob/main/vos_benchmark/benchmark.py with slight mod
#################################################################################################################
def _seg2bmap(seg, width=None, height=None):
    """
    From a segmentation, compute a binary boundary map with 1 pixel wide
    boundaries. The boundary pixels are offset by 1/2 pixel towards the
    origin from the actual segment boundary.
    Arguments:
        seg    : Segments labeled from 1..k.
        width  : Width of desired bmap  <= seg.shape[1]
        height : Height of desired bmap <= seg.shape[0]
    Returns:
        bmap (ndarray): Binary boundary map.
    David Martin <dmartin@eecs.berkeley.edu>
    January 2003
    """
    seg = seg.astype(bool)
    seg[seg > 0] = 1

    assert np.atleast_3d(seg).shape[2] == 1

    width = seg.shape[1] if width is None else width
    height = seg.shape[0] if height is None else height

    h, w = seg.shape[:2]

    ar1 = float(width) / float(height)
    ar2 = float(w) / float(h)

    assert not (
        width > w | height > h | abs(ar1 - ar2) > 0.01
    ), "Cannot convert %dx%d seg to %dx%d bmap." % (w, h, width, height)

    e = np.zeros_like(seg)
    s = np.zeros_like(seg)
    se = np.zeros_like(seg)

    e[:, :-1] = seg[:, 1:]
    s[:-1, :] = seg[1:, :]
    se[:-1, :-1] = seg[1:, 1:]

    b = seg ^ e | seg ^ s | seg ^ se
    b[-1, :] = seg[-1, :] ^ e[-1, :]
    b[:, -1] = seg[:, -1] ^ s[:, -1]
    b[-1, -1] = 0

    if w == width and h == height:
        bmap = b
    else:
        bmap = np.zeros((height, width))
        for x in range(w):
            for y in range(h):
                if b[y, x]:
                    j = 1 + math.floor((y - 1) + height / h)
                    i = 1 + math.floor((x - 1) + width / h)
                    bmap[j, i] = 1

    return bmap
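
# Illustrative example of _seg2bmap, worked out by hand (not part of the
# original file): for a 4x4 mask with a 2x2 foreground block,
#   seg = [[0, 0, 0, 0],          bmap = [[1, 1, 1, 0],
#          [0, 1, 1, 0],                  [1, 0, 1, 0],
#          [0, 1, 1, 0],   ==>            [1, 1, 1, 0],
#          [0, 0, 0, 0]]                  [0, 0, 0, 0]]
# The 1-pixel boundary is shifted half a pixel towards the origin, so the
# top/left boundary pixels lie just outside the block while the bottom/right
# boundary pixels lie on the block's last row and column.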

def get_iou(intersection, pixel_sum):
    # handle edge cases without resorting to epsilon
    if intersection == pixel_sum:
        # both mask and gt have zero pixels in them
        assert intersection == 0
        return 1

    return intersection / (pixel_sum - intersection)
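
# Worked example (illustrative): a 5-pixel prediction and a 5-pixel ground
# truth that overlap on 3 pixels give get_iou(3, 10) = 3 / (10 - 3) ~= 0.43,
# i.e. intersection over union. When both masks are empty, intersection ==
# pixel_sum == 0 and the function returns 1 by convention.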

class Evaluator:
    def __init__(self, boundary=0.008, name=None, obj_id=None):
        # boundary: used in computing boundary F-score
        self.boundary = boundary
        self.name = name
        self.obj_id = obj_id
        self.objects_in_gt = set()
        self.objects_in_masks = set()
        self.object_iou = defaultdict(list)
        self.boundary_f = defaultdict(list)

    def feed_frame(self, mask: np.ndarray, gt: np.ndarray):
        """
        Compute and accumulate metrics for a single frame (mask/gt pair)
        """
        # get all objects in the ground-truth
        gt_objects = np.unique(gt)
        gt_objects = gt_objects[gt_objects != 0].tolist()

        # get all objects in the predicted mask
        mask_objects = np.unique(mask)
        mask_objects = mask_objects[mask_objects != 0].tolist()

        self.objects_in_gt.update(set(gt_objects))
        self.objects_in_masks.update(set(mask_objects))
        all_objects = self.objects_in_gt.union(self.objects_in_masks)

        # boundary disk for boundary F-score. It is the same for all objects.
        bound_pix = np.ceil(self.boundary * np.linalg.norm(mask.shape))
        boundary_disk = disk(bound_pix)

        for obj_idx in all_objects:
            obj_mask = mask == obj_idx
            obj_gt = gt == obj_idx

            # object iou
            self.object_iou[obj_idx].append(
                get_iou((obj_mask * obj_gt).sum(), obj_mask.sum() + obj_gt.sum())
            )
            """
            # boundary f-score
            This part is copied from davis2017-evaluation
            """
            mask_boundary = _seg2bmap(obj_mask)
            gt_boundary = _seg2bmap(obj_gt)
            mask_dilated = cv2.dilate(mask_boundary.astype(np.uint8), boundary_disk)
            gt_dilated = cv2.dilate(gt_boundary.astype(np.uint8), boundary_disk)

            # Get the intersection
            gt_match = gt_boundary * mask_dilated
            fg_match = mask_boundary * gt_dilated

            # Area of the intersection
            n_fg = np.sum(mask_boundary)
            n_gt = np.sum(gt_boundary)

            # Compute precision and recall
            if n_fg == 0 and n_gt > 0:
                precision = 1
                recall = 0
            elif n_fg > 0 and n_gt == 0:
                precision = 0
                recall = 1
            elif n_fg == 0 and n_gt == 0:
                precision = 1
                recall = 1
            else:
                precision = np.sum(fg_match) / float(n_fg)
                recall = np.sum(gt_match) / float(n_gt)

            # Compute F measure
            if precision + recall == 0:
                F = 0
            else:
                F = 2 * precision * recall / (precision + recall)
            self.boundary_f[obj_idx].append(F)

    def conclude(self):
        all_iou = {}
        all_boundary_f = {}

        for object_id in self.objects_in_gt:
            all_iou[object_id] = np.mean(self.object_iou[object_id]) * 100
            all_boundary_f[object_id] = np.mean(self.boundary_f[object_id]) * 100

        return all_iou, all_boundary_f
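
# Minimal standalone usage sketch for Evaluator (frame_pairs is a hypothetical
# iterable of (gt, pred) mask arrays, not something defined in this file):
#
#   evaluator = Evaluator(name="video_0001", obj_id="000")
#   for gt, pred in frame_pairs:              # each a (H, W) integer/bool mask
#       evaluator.feed_frame(mask=pred, gt=gt)
#   iou, boundary_f = evaluator.conclude()    # per-object J and F in [0, 100]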

def benchmark(
    gt_roots,
    mask_roots,
    strict=True,
    num_processes=None,
    *,
    verbose=True,
    skip_first_and_last=True,
):
    """
    gt_roots: a list of paths to datasets, i.e., [path_to_DatasetA, path_to_DatasetB, ...]
    mask_roots: same as above, but the .png are masks predicted by the model
    strict: when True, all videos in the dataset must have corresponding predictions.
        Setting it to False is useful in cases where the ground-truth contains both train/val
        sets, but the model only predicts the val subset.
        Either way, if a video is predicted (i.e., the corresponding folder exists),
        then it must at least contain all the masks in the ground truth annotations.
        Masks that are in the prediction but not in the ground-truth
        (i.e., sparse annotations) are ignored.
    skip_first_and_last: whether we should skip the first and the last frame in evaluation.
        This is used by DAVIS 2017 in their semi-supervised evaluation.
        It should be disabled for unsupervised evaluation.
    """
    assert len(gt_roots) == len(mask_roots)
    single_dataset = len(gt_roots) == 1

    if verbose:
        if skip_first_and_last:
            print(
                "We are *SKIPPING* the evaluation of the first and the last frame (standard for semi-supervised video object segmentation)."
            )
        else:
            print(
                "We are *NOT SKIPPING* the evaluation of the first and the last frame (*NOT STANDARD* for semi-supervised video object segmentation)."
            )

    pool = Pool(num_processes)
    start = time.time()
    to_wait = []
    for gt_root, mask_root in zip(gt_roots, mask_roots):
        # Validate folders
        validated = True
        gt_videos = os.listdir(gt_root)
        mask_videos = os.listdir(mask_root)
        # if the user passed the root directory instead of Annotations
        if len(gt_videos) != len(mask_videos):
            if "Annotations" in gt_videos:
                if ".png" not in os.listdir(path.join(gt_root, "Annotations"))[0]:
                    gt_root = path.join(gt_root, "Annotations")
                    gt_videos = os.listdir(gt_root)

        # remove non-folder items
        gt_videos = list(filter(lambda x: path.isdir(path.join(gt_root, x)), gt_videos))
        mask_videos = list(
            filter(lambda x: path.isdir(path.join(mask_root, x)), mask_videos)
        )

        if not strict:
            videos = sorted(list(set(gt_videos) & set(mask_videos)))
        else:
            gt_extras = set(gt_videos) - set(mask_videos)
            mask_extras = set(mask_videos) - set(gt_videos)

            if len(gt_extras) > 0:
                print(
                    f"Videos that are in {gt_root} but not in {mask_root}: {gt_extras}"
                )
                validated = False
            if len(mask_extras) > 0:
                print(
                    f"Videos that are in {mask_root} but not in {gt_root}: {mask_extras}"
                )
                validated = False
            if not validated:
                print("Validation failed. Exiting.")
                exit(1)

            videos = sorted(gt_videos)

        if verbose:
            print(
                f"In dataset {gt_root}, we are evaluating on {len(videos)} videos: {videos}"
            )

        if single_dataset:
            if verbose:
                results = tqdm.tqdm(
                    pool.imap(
                        VideoEvaluator(
                            gt_root, mask_root, skip_first_and_last=skip_first_and_last
                        ),
                        videos,
                    ),
                    total=len(videos),
                )
            else:
                results = pool.map(
                    VideoEvaluator(
                        gt_root, mask_root, skip_first_and_last=skip_first_and_last
                    ),
                    videos,
                )
        else:
            to_wait.append(
                pool.map_async(
                    VideoEvaluator(
                        gt_root, mask_root, skip_first_and_last=skip_first_and_last
                    ),
                    videos,
                )
            )

    pool.close()

    all_global_jf, all_global_j, all_global_f = [], [], []
    all_object_metrics = []
    for i, mask_root in enumerate(mask_roots):
        if not single_dataset:
            results = to_wait[i].get()

        all_iou = []
        all_boundary_f = []
        object_metrics = {}
        for name, iou, boundary_f in results:
            all_iou.extend(list(iou.values()))
            all_boundary_f.extend(list(boundary_f.values()))
            object_metrics[name] = (iou, boundary_f)

        global_j = np.array(all_iou).mean()
        global_f = np.array(all_boundary_f).mean()
        global_jf = (global_j + global_f) / 2

        time_taken = time.time() - start
        """
        Build string for reporting results
        """
        # find max length for padding
        ml = max(*[len(n) for n in object_metrics.keys()], len("Global score"))
        # build header
        out_string = f'{"sequence":<{ml}},{"obj":>3}, {"J&F":>4}, {"J":>4}, {"F":>4}\n'
        out_string += f'{"Global score":<{ml}},{"":>3}, {global_jf:.1f}, {global_j:.1f}, {global_f:.1f}\n'
        # append one line for each object
        for name, (iou, boundary_f) in object_metrics.items():
            for object_idx in iou.keys():
                j, f = iou[object_idx], boundary_f[object_idx]
                jf = (j + f) / 2
                out_string += (
                    f"{name:<{ml}},{object_idx:03}, {jf:>4.1f}, {j:>4.1f}, {f:>4.1f}\n"
                )

        # print to console
        if verbose:
            print(out_string.replace(",", " "), end="")
            print("\nSummary:")
            print(
                f"Global score: J&F: {global_jf:.1f} J: {global_j:.1f} F: {global_f:.1f}"
            )
            print(f"Time taken: {time_taken:.2f}s")

        # print to file
        result_path = path.join(mask_root, "results.csv")
        print(f"Saving the results to {result_path}")
        with open(result_path, "w") as f:
            f.write(out_string)

        all_global_jf.append(global_jf)
        all_global_j.append(global_j)
        all_global_f.append(global_f)
        all_object_metrics.append(object_metrics)

    return all_global_jf, all_global_j, all_global_f, all_object_metrics
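

# Example invocation, included here as a minimal sketch: the directory names
# below are hypothetical placeholders, and this module is normally driven from
# a separate evaluation script rather than run directly. The call returns
# per-dataset global J&F, J, and F plus per-video, per-object metrics, and
# writes a results.csv into each mask root.
if __name__ == "__main__":
    benchmark(
        gt_roots=["/data/sa_v_val/Annotations_6fps"],          # hypothetical path
        mask_roots=["/outputs/sa_v_val_predictions"],          # hypothetical path
        strict=True,
        num_processes=8,
        verbose=True,
        skip_first_and_last=True,  # standard for SA-V val/test and DAVIS semi-supervised
    )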