# conversion_util.py
  1. # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
  2. # pyre-unsafe
  3. import json
  4. import os
  5. from collections import defaultdict
  6. from tqdm import tqdm
  7. def convert_ytbvis_to_cocovid_gt(ann_json, save_path=None):
  8. """Convert YouTube VIS dataset to COCO-style video instance segmentation format.
  9. Args:
  10. ann_json (str): Path to YouTube VIS annotation JSON file
  11. save_path (str): path to save converted COCO-style JSON
  12. """
  13. # Initialize COCO structure
  14. VIS = {
  15. "info": {},
  16. "images": [],
  17. "videos": [],
  18. "tracks": [],
  19. "annotations": [],
  20. "categories": [],
  21. "licenses": [],
  22. }
  23. # Load original annotations
  24. official_anns = json.load(open(ann_json))
  25. VIS["categories"] = official_anns["categories"] # Direct copy categories
  26. # Initialize counters
  27. records = dict(img_id=1, ann_id=1)
  28. # Create video-to-annotations mapping
  29. vid_to_anns = defaultdict(list)
  30. for ann in official_anns["annotations"]:
  31. vid_to_anns[ann["video_id"]].append(ann)
  32. # Create tracks directly
  33. VIS["tracks"] = [
  34. {
  35. "id": ann["id"],
  36. "category_id": ann["category_id"],
  37. "video_id": ann["video_id"],
  38. }
  39. for ann in official_anns["annotations"]
  40. ]
  41. # Process videos
  42. for video_info in tqdm(official_anns["videos"]):
  43. # Create video entry
  44. video = {
  45. "id": video_info["id"],
  46. "name": os.path.dirname(video_info["file_names"][0]),
  47. "width": video_info["width"],
  48. "height": video_info["height"],
  49. "length": video_info["length"],
  50. "neg_category_ids": [],
  51. "not_exhaustive_category_ids": [],
  52. }
  53. VIS["videos"].append(video)
  54. # Process frames
  55. num_frames = len(video_info["file_names"])
  56. for frame_idx in range(num_frames):
  57. # Create image entry
  58. image = {
  59. "id": records["img_id"],
  60. "video_id": video_info["id"],
  61. "file_name": video_info["file_names"][frame_idx],
  62. "width": video_info["width"],
  63. "height": video_info["height"],
  64. "frame_index": frame_idx,
  65. "frame_id": frame_idx,
  66. }
  67. VIS["images"].append(image)
  68. # Process annotations for this frame
  69. if video_info["id"] in vid_to_anns:
  70. for ann in vid_to_anns[video_info["id"]]:
  71. bbox = ann["bboxes"][frame_idx]
  72. if bbox is None:
  73. continue
  74. # Create annotation entry
  75. annotation = {
  76. "id": records["ann_id"],
  77. "video_id": video_info["id"],
  78. "image_id": records["img_id"],
  79. "track_id": ann["id"],
  80. "category_id": ann["category_id"],
  81. "bbox": bbox,
  82. "area": ann["areas"][frame_idx],
  83. "segmentation": ann["segmentations"][frame_idx],
  84. "iscrowd": ann["iscrowd"],
  85. }
  86. VIS["annotations"].append(annotation)
  87. records["ann_id"] += 1
  88. records["img_id"] += 1
  89. # Print summary
  90. print(f"Converted {len(VIS['videos'])} videos")
  91. print(f"Converted {len(VIS['images'])} images")
  92. print(f"Created {len(VIS['tracks'])} tracks")
  93. print(f"Created {len(VIS['annotations'])} annotations")
  94. if save_path is None:
  95. return VIS
  96. # Save output
  97. save_dir = os.path.dirname(save_path)
  98. os.makedirs(save_dir, exist_ok=True)
  99. json.dump(VIS, open(save_path, "w"))
  100. return VIS
  101. def convert_ytbvis_to_cocovid_pred(
  102. youtubevis_pred_path: str, converted_dataset_path: str, output_path: str
  103. ) -> None:
  104. """
  105. Convert YouTubeVIS predictions to COCO format with video_id preservation
  106. Args:
  107. youtubevis_pred_path: Path to YouTubeVIS prediction JSON
  108. converted_dataset_path: Path to converted COCO dataset JSON
  109. output_path: Path to save COCO format predictions
  110. """
  111. # Load YouTubeVIS predictions
  112. with open(youtubevis_pred_path) as f:
  113. ytv_predictions = json.load(f)
  114. # Load converted dataset for image ID mapping
  115. with open(converted_dataset_path) as f:
  116. coco_dataset = json.load(f)
  117. # Create (video_id, frame_idx) -> image_id mapping
  118. image_id_map = {
  119. (img["video_id"], img["frame_index"]): img["id"]
  120. for img in coco_dataset["images"]
  121. }
  122. coco_annotations = []
  123. track_id_counter = 1 # Unique track ID generator
  124. for pred in tqdm(ytv_predictions):
  125. video_id = pred["video_id"]
  126. category_id = pred["category_id"]
  127. bboxes = pred["bboxes"]
  128. segmentations = pred.get("segmentations", []) # Get segmentations if available
  129. areas = pred.get("areas", []) # Get areas if available
  130. score = pred["score"]
  131. # Assign unique track ID for this prediction
  132. track_id = track_id_counter
  133. track_id_counter += 1
  134. # Ensure segmentations and areas have the same length as bboxes
  135. if len(segmentations) == 0:
  136. segmentations = [None] * len(bboxes)
  137. if len(areas) == 0:
  138. areas = [None] * len(bboxes)
  139. for frame_idx, (bbox, segmentation, area_from_pred) in enumerate(
  140. zip(bboxes, segmentations, areas)
  141. ):
  142. # Skip frames with missing objects (None or zero bbox)
  143. if bbox is None or all(x == 0 for x in bbox):
  144. continue
  145. # Get corresponding image ID from mapping
  146. image_id = image_id_map.get((video_id, frame_idx))
  147. if image_id is None:
  148. raise RuntimeError(
  149. f"prediction {video_id=}, {frame_idx=} does not match any images in the converted COCO format"
  150. )
  151. # Extract bbox coordinates
  152. x, y, w, h = bbox
  153. # Calculate area - use area from prediction if available, otherwise from bbox
  154. if area_from_pred is not None and area_from_pred > 0:
  155. area = area_from_pred
  156. else:
  157. area = w * h
  158. # Create COCO annotation with video_id
  159. coco_annotation = {
  160. "image_id": int(image_id),
  161. "video_id": video_id, # Added video_id field
  162. "track_id": track_id,
  163. "category_id": category_id,
  164. "bbox": [float(x), float(y), float(w), float(h)],
  165. "area": float(area),
  166. "iscrowd": 0,
  167. "score": float(score),
  168. }
  169. # Add segmentation if available
  170. if segmentation is not None:
  171. coco_annotation["segmentation"] = segmentation
  172. coco_annotations.append(coco_annotation)
  173. # Save output
  174. with open(output_path, "w") as f:
  175. json.dump(coco_annotations, f)
  176. print(f"Converted {len(coco_annotations)} predictions to COCO format with video_id")