# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
# pyre-unsafe
import copy
import gc
import logging
import os
from collections import defaultdict
from operator import xor
from pathlib import Path
from typing import List, Optional

import numpy as np
import pycocotools.mask as mask_util
import torch
from pycocotools.cocoeval import COCOeval
from sam3.eval.cgf1_eval import CGF1Eval
from sam3.eval.coco_eval_offline import convert_to_xywh
from sam3.model.box_ops import box_xywh_inter_union
from sam3.train.masks_ops import rle_encode
from sam3.train.utils import distributed as dist
from typing_extensions import override

try:
    import rapidjson as json
except ModuleNotFoundError:
    import json

from iopath.common.file_io import g_pathmgr


class YTVISevalMixin:
    """
    Identical to COCOeval but adapts computeIoU to compute IoU between tracklets/masklets.
    """

    @override
    def _prepare(self):
        """
        Copied from cocoeval.py but doesn't convert masks to RLEs (we assume they already are RLEs)
        """
        p = self.params
        if p.useCats:
            gts = self.cocoGt.loadAnns(
                self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
            )
            dts = self.cocoDt.loadAnns(
                self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)
            )
        else:
            gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
            dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))
        # set ignore flag
        for gt in gts:
            gt["ignore"] = gt["ignore"] if "ignore" in gt else 0
            gt["ignore"] = "iscrowd" in gt and gt["iscrowd"]
            if p.iouType == "keypoints":
                gt["ignore"] = (gt["num_keypoints"] == 0) or gt["ignore"]
        self._gts = defaultdict(list)  # gt for evaluation
        self._dts = defaultdict(list)  # dt for evaluation
        for gt in gts:
            self._gts[gt["image_id"], gt["category_id"]].append(gt)
        for dt in dts:
            self._dts[dt["image_id"], dt["category_id"]].append(dt)
        self.evalImgs = defaultdict(list)  # per-image per-category evaluation results
        self.eval = {}  # accumulated evaluation results

    def computeIoU(self, imgId, catId):
        """
        Compute IoU between tracklets. Copied from cocoeval.py but adapted for videos (in YT-VIS format)
        """
        p = self.params
        if p.useCats:
            gt = self._gts[imgId, catId]
            dt = self._dts[imgId, catId]
        else:
            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
        if len(gt) == 0 or len(dt) == 0:
            return []
        # For class mAP and phrase AP evaluation, we sort the detections in descending order of scores (as in COCOeval).
        # For demo F1 evaluation, we DO NOT sort the detections (but match them with GTs via Hungarian matching).
        assert hasattr(self, "sort_inds_by_scores_in_iou"), (
            "subclasses that inherit YTVISevalMixin should set `self.sort_inds_by_scores_in_iou` "
            "(True for class mAP and phrase AP, False for demo F1)"
        )
        if self.sort_inds_by_scores_in_iou:
            inds = np.argsort([-d["score"] for d in dt], kind="mergesort")
            dt = [dt[i] for i in inds]
            if len(dt) > p.maxDets[-1]:
                dt = dt[0 : p.maxDets[-1]]
        if p.iouType == "segm":
            g = [g["segmentations"] for g in gt]
            d = [d["segmentations"] for d in dt]
        elif p.iouType == "bbox":
            g = [g["bboxes"] for g in gt]
            d = [d["bboxes"] for d in dt]
        else:
            raise Exception("unknown iouType for iou computation")
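
        # Both helpers below compute a spatio-temporal IoU for one (prediction, GT)
        # pair: per-frame intersections and unions are summed over all frames first,
        # and the ratio of the two sums is taken (YT-VIS-style video IoU), rather
        # than averaging per-frame IoUs.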
        def iou_tracklets(preds, gts):
            preds = torch.tensor(preds)
            gts = torch.tensor(gts)
            inter, union = box_xywh_inter_union(
                preds.unsqueeze(1), gts.unsqueeze(0)
            )  # Num preds x Num GTs x Num frames
            inter = inter.sum(-1)
            union = union.sum(-1)
            assert (union > 0).all(), (
                "There exists a tracklet with zero GTs across time. This is suspicious"
            )
            return inter / union

        def iou_masklets(preds, gts):
            inter = 0
            union = 0
            for p_i, gt_i in zip(preds, gts):
                if p_i and gt_i:
                    # Compute areas of intersection and union
                    inter += mask_util.area(
                        mask_util.merge([p_i, gt_i], intersect=True)
                    )
                    union += mask_util.area(
                        mask_util.merge([p_i, gt_i], intersect=False)
                    )
                elif gt_i:
                    union += mask_util.area(gt_i)
                elif p_i:
                    union += mask_util.area(p_i)
            if union > 0:
                iou = inter / union
                assert iou >= 0 and iou <= 1, "Encountered an error in IoU computation"
            else:
                assert np.isclose(inter, 0) and np.isclose(union, 0), (
                    "Encountered an error in IoU computation"
                )
                iou = 1
            return iou

        if p.iouType == "segm":
            ious = [[iou_masklets(d_i, g_i) for g_i in g] for d_i in d]
        else:
            ious = iou_tracklets(d, g)
        return np.array(ious)


class YTVISeval(YTVISevalMixin, COCOeval):
    # For class mAP and phrase AP evaluation, we sort the detections in descending order of scores (as in COCOeval).
    sort_inds_by_scores_in_iou = True


class VideoDemoF1Eval(YTVISevalMixin, CGF1Eval):
    # For demo F1 evaluation, we DO NOT sort the detections (but match them with GTs via Hungarian matching).
    sort_inds_by_scores_in_iou = False
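

# Example usage sketch for YTVISeval: it follows the standard COCOeval driver from
# pycocotools. `gt_api` and `dt_api` are assumed to be COCO-style API objects holding
# YT-VIS-format ground truth and predictions (how they are loaded is not shown here):
#
#   ytvis_eval = YTVISeval(gt_api, dt_api, iouType="segm")
#   ytvis_eval.evaluate()    # per-video, per-category matching via computeIoU above
#   ytvis_eval.accumulate()
#   ytvis_eval.summarize()   # prints the AP/AR summary over masklets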


class YTVISResultsWriter:
    """
    Gathers and dumps predictions in YT-VIS format.

    Expected flow of API calls: reset() -> N * update() -> compute_synced()
    """

    def __init__(
        self,
        dump_file: str,
        postprocessor,
        gather_pred_via_filesys=False,
        pred_file_evaluators: Optional[List] = None,
        save_per_frame_scores: bool = False,
        write_eval_metrics_file: bool = True,
        eval_metrics_file_suffix: str = ".sam3_eval_metrics",
    ):
        self.dump_file = dump_file
        self.dump = []
        self.postprocessor = postprocessor
        self.gather_pred_via_filesys = gather_pred_via_filesys
        if dist.is_main_process():
            dirname = os.path.dirname(self.dump_file)
            if not os.path.exists(dirname):
                os.makedirs(dirname, exist_ok=True)
                logging.info(f"Creating folder: {dirname}")
        # the evaluation hooks to be applied to the prediction files
        self.pred_file_evaluators = pred_file_evaluators or []
        self.save_per_frame_scores = save_per_frame_scores
        # in addition to the prediction file, we also write the evaluation metrics
        # for easier debugging and analysis (stored in a separate eval_metrics_file
        # so that we can keep the dumped prediction file in YT-VIS format)
        self.write_eval_metrics_file = write_eval_metrics_file
        if self.write_eval_metrics_file:
            self.eval_metrics_file = self.dump_file + eval_metrics_file_suffix
            os.makedirs(os.path.dirname(self.eval_metrics_file), exist_ok=True)

    def _dump_vid_preds(self, results):
        dumped_results = copy.deepcopy(results)
        self.dump.extend(dumped_results)

    def prepare(self, predictions):
        ytvis_results = []
        for video_id, prediction in predictions.items():
            if len(prediction) == 0:
                continue
            for k in ["boxes", "scores", "labels"]:
                assert k in prediction, (
                    f"Expected predictions to have `{k}` key, available keys are {prediction.keys()}"
                )
            if self.save_per_frame_scores:
                assert "per_frame_scores" in prediction, (
                    f"Expected predictions to have `per_frame_scores` key, available keys are {prediction.keys()}"
                )
            assert xor("masks" in prediction, "masks_rle" in prediction), (
                f"Expected predictions to have either `masks` key or `masks_rle` key, available keys are {prediction.keys()}"
            )
            boxes = prediction["boxes"]
            boxes = convert_to_xywh(boxes).tolist()
            scores = prediction["scores"].tolist()
            labels = prediction["labels"].tolist()
            if "masks" in prediction:
                masks = prediction["masks"].squeeze(2)
                assert masks.ndim == 4, (
                    "Expected masks to be of shape (N_preds, T_frames, H, W)"
                )
                areas = [mask.flatten(1).sum(1).tolist() for mask in masks]
                rles = [rle_encode(masklet) for masklet in masks]
                # memory clean
                del masks
                del prediction["masks"]
            elif "masks_rle" in prediction:
                rles = prediction.pop("masks_rle")
                areas = [
                    [0 if rle is None else rle.pop("area") for rle in rles_per_obj]
                    for rles_per_obj in rles
                ]
            else:
                raise ValueError(
                    "Expected either `masks` or `masks_rle` key in the predictions."
                )
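            # Assemble one YT-VIS-style record per predicted track (masklet): per-frame
            # boxes, per-frame RLE segmentations and areas, a track-level score, and
            # the category id of the track.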
            new_results = [
                {
                    "video_id": video_id,
                    "category_id": track_label,
                    "bboxes": track_boxes,
                    "score": track_score,
                    "segmentations": track_masks,
                    "areas": track_areas,
                }
                for (
                    track_boxes,
                    track_masks,
                    track_areas,
                    track_score,
                    track_label,
                ) in zip(boxes, rles, areas, scores, labels)
            ]
            # Optionally, save per-frame scores
            if self.save_per_frame_scores:
                per_frame_scores = prediction["per_frame_scores"].tolist()
                for res, track_per_frame_scores in zip(new_results, per_frame_scores):
                    res["per_frame_scores"] = track_per_frame_scores
            ytvis_results.extend(new_results)
        return ytvis_results

    def set_sync_device(self, device: torch.device):
        self._sync_device = device

    def update(self, *args, **kwargs):
        predictions = self.postprocessor.process_results(*args, **kwargs)
        results = self.prepare(predictions)
        self._dump_vid_preds(results)

    def _dump_preds(self):
        if not dist.is_main_process():
            self.dump = []
            gc.collect()
            return
        dumped_file = Path(self.dump_file)
        logging.info(f"YTVIS evaluator: Dumping predictions to {dumped_file}")
        with g_pathmgr.open(str(dumped_file), "w") as f:
            json.dump(self.dump, f)
        self.dump = []
        gc.collect()
        return str(dumped_file)

    def synchronize_between_processes(self):
        logging.info("YT-VIS evaluator: Synchronizing between processes")
        dump_dict = self._dedup_pre_gather(self.dump)
        if self.gather_pred_via_filesys:
            dump_dict_all_gpus = dist.gather_to_rank_0_via_filesys(dump_dict)
        else:
            dump_dict_all_gpus = dist.all_gather(dump_dict, force_cpu=True)
        self.dump = self._dedup_post_gather(dump_dict_all_gpus)
        logging.info(f"Gathered all {len(self.dump)} predictions")

    def _dedup_pre_gather(self, predictions):
        """
        Organize the predictions as a dict-of-list using (video_id, category_id) as keys
        for deduplication after gathering them across GPUs.

        During evaluation, the PyTorch data loader with `drop_last: False` wraps the
        dataset around so that its length becomes a multiple of the world size (number
        of GPUs), duplicating the remaining samples. This causes the same test sample
        to appear simultaneously on multiple GPUs, resulting in duplicated predictions
        being saved into the prediction files. These duplicates are then counted as
        false positives under detection mAP metrics (since a ground truth can be
        matched with only one prediction).

        For example, with 4 GPUs and 6 samples [A1, A2, B1, B2, C1, C2], the data
        loader (with `drop_last: False`) wraps the dataset around like
        `[A1, A2, B1, B2, C1, C2, *A1*, *A2*]` to make its length a multiple of 4 and
        then splits it as
        - GPU 0: A1, C1
        - GPU 1: A2, C2
        - GPU 2: B1, **A1**
        - GPU 3: B2, **A2**
        (as in DistributedSampler in https://github.com/pytorch/pytorch/blob/521588519da9f4876d90ddd7a17c10d0eca89dc6/torch/utils/data/distributed.py#L116-L124)
        so the predictions on A1 and A2 occur twice in the final gathered outputs in
        the prediction file (and are counted as false positives). This also affects
        our YT-VIS official val evaluation, but to a lesser extent than YT-VIS dev
        since the latter is much smaller and more susceptible to false positives.

        So we need to deduplicate. The tricky part is that we cannot deduplicate
        simply by video id, given that we shard the classes in each video across
        multiple batches (with 20 prompts per batch) in our "orig_cats" eval dbs.
        The solution is to deduplicate based on the (video_id, category_id) tuple as
        the key. We organize the predictions as a dict-of-list using
        (video_id, category_id) as keys on each GPU, with the list of masklets under
        this (video_id, category_id) on this GPU as values. Then, we all-gather this
        dict-of-list across GPUs and, if a key (video_id, category_id) appears on
        multiple GPUs, we only take the prediction masklet list from one GPU.
        """
        prediction_dict = defaultdict(list)
        for p in predictions:
            prediction_dict[(p["video_id"], p["category_id"])].append(p)
        return prediction_dict

    def _dedup_post_gather(self, list_of_prediction_dict):
        """
        Deduplicate the predictions from all GPUs. See `_dedup_pre_gather` for details.
        """
        dedup_prediction_dict = {}
        duplication_keys = []
        for prediction_dict in list_of_prediction_dict:
            for k, v in prediction_dict.items():
                if k not in dedup_prediction_dict:
                    dedup_prediction_dict[k] = v
                else:
                    duplication_keys.append(k)
        logging.info(
            f"skipped {len(duplication_keys)} duplicated predictions in YTVISResultsWriter "
            f"with the following (video_id, category_id) tuples: {duplication_keys}"
        )
        dedup_predictions = sum(dedup_prediction_dict.values(), [])
        return dedup_predictions

    def compute_synced(self):
        self.synchronize_between_processes()
        dumped_file = self._dump_preds()
        if not dist.is_main_process():
            return {"": 0.0}
        # run evaluation hooks on the prediction file
        meters = {}
        all_video_np_level_results = defaultdict(dict)
        for evaluator in self.pred_file_evaluators:
            gc.collect()
            results, video_np_level_results = evaluator.evaluate(dumped_file)
            meters.update(results)
            for (video_id, category_id), res in video_np_level_results.items():
                all_video_np_level_results[(video_id, category_id)].update(res)
        gc.collect()
        if self.write_eval_metrics_file:
            # convert the nested dict of {(video_id, category_id): per_sample_metric_dict}
            # to a list of per-sample metric dicts (with video_id and category_id) for JSON,
            # as JSON doesn't allow using tuples like (video_id, category_id) as dict keys
            video_np_level_metrics = [
                {"video_id": video_id, "category_id": category_id, **res}
                for (video_id, category_id), res in all_video_np_level_results.items()
            ]
            eval_metrics = {
                "dataset_level_metrics": meters,
                "video_np_level_metrics": video_np_level_metrics,
            }
            with g_pathmgr.open(self.eval_metrics_file, "w") as f:
                json.dump(eval_metrics, f)
            logging.info(
                f"YTVIS evaluator: Dumped evaluation metrics to {self.eval_metrics_file}"
            )
        if len(meters) == 0:
            meters = {"": 0.0}
        return meters

    def compute(self):
        return {"": 0.0}

    def reset(self, *args, **kwargs):
        self.dump = []