# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
# pyre-unsafe
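"""Converters from the YouTube VIS annotation/prediction format to a
COCO-style video format with ``videos``, ``images``, ``tracks``, and
per-frame ``annotations`` entries."""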
import json
import os
from collections import defaultdict

from tqdm import tqdm


def convert_ytbvis_to_cocovid_gt(ann_json, save_path=None):
    """Convert YouTube VIS ground-truth annotations to a COCO-style video format.

    Args:
        ann_json (str): Path to the YouTube VIS annotation JSON file.
        save_path (str, optional): Path to save the converted COCO-style JSON.
            If None, the converted dict is returned without being written to disk.

    Returns:
        dict: The converted COCO-style annotations.
    """
    # Initialize the COCO-style container
    VIS = {
        "info": {},
        "images": [],
        "videos": [],
        "tracks": [],
        "annotations": [],
        "categories": [],
        "licenses": [],
    }

    # Load original annotations
    with open(ann_json) as f:
        official_anns = json.load(f)
    VIS["categories"] = official_anns["categories"]  # Copy categories unchanged
    # Initialize running IDs for images and annotations
    records = dict(img_id=1, ann_id=1)

    # Create video-to-annotations mapping
    vid_to_anns = defaultdict(list)
    for ann in official_anns["annotations"]:
        vid_to_anns[ann["video_id"]].append(ann)

    # Create tracks directly: one track per ground-truth instance
    VIS["tracks"] = [
        {
            "id": ann["id"],
            "category_id": ann["category_id"],
            "video_id": ann["video_id"],
        }
        for ann in official_anns["annotations"]
    ]
    # Process videos
    for video_info in tqdm(official_anns["videos"]):
        # Create video entry
        video = {
            "id": video_info["id"],
            "name": os.path.dirname(video_info["file_names"][0]),
            "width": video_info["width"],
            "height": video_info["height"],
            "length": video_info["length"],
            "neg_category_ids": [],
            "not_exhaustive_category_ids": [],
        }
        VIS["videos"].append(video)

        # Process frames
        num_frames = len(video_info["file_names"])
        for frame_idx in range(num_frames):
            # Create image entry
            image = {
                "id": records["img_id"],
                "video_id": video_info["id"],
                "file_name": video_info["file_names"][frame_idx],
                "width": video_info["width"],
                "height": video_info["height"],
                "frame_index": frame_idx,
                "frame_id": frame_idx,
            }
            VIS["images"].append(image)

            # Process annotations for this frame
            if video_info["id"] in vid_to_anns:
                for ann in vid_to_anns[video_info["id"]]:
                    bbox = ann["bboxes"][frame_idx]
                    if bbox is None:
                        continue
                    # Create annotation entry
                    annotation = {
                        "id": records["ann_id"],
                        "video_id": video_info["id"],
                        "image_id": records["img_id"],
                        "track_id": ann["id"],
                        "category_id": ann["category_id"],
                        "bbox": bbox,
                        "area": ann["areas"][frame_idx],
                        "segmentation": ann["segmentations"][frame_idx],
                        "iscrowd": ann["iscrowd"],
                    }
                    VIS["annotations"].append(annotation)
                    records["ann_id"] += 1
            records["img_id"] += 1
    # Print summary
    print(f"Converted {len(VIS['videos'])} videos")
    print(f"Converted {len(VIS['images'])} images")
    print(f"Created {len(VIS['tracks'])} tracks")
    print(f"Created {len(VIS['annotations'])} annotations")
    if save_path is None:
        return VIS

    # Save output; guard against save paths with no directory component,
    # for which os.makedirs("") would raise
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    with open(save_path, "w") as f:
        json.dump(VIS, f)
    return VIS
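

# Example usage of the ground-truth converter, kept as a comment so importing
# this module stays side-effect free. The paths are illustrative, not part of
# the original module:
#
#   vis_gt = convert_ytbvis_to_cocovid_gt(
#       ann_json="data/ytvis_2019/valid.json",
#       save_path="data/ytvis_2019/valid_cocovid.json",
#   )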


def convert_ytbvis_to_cocovid_pred(
    youtubevis_pred_path: str, converted_dataset_path: str, output_path: str
) -> None:
    """Convert YouTube VIS predictions to COCO format, preserving video IDs.

    Args:
        youtubevis_pred_path: Path to the YouTube VIS prediction JSON.
        converted_dataset_path: Path to the converted COCO dataset JSON
            (the output of ``convert_ytbvis_to_cocovid_gt``).
        output_path: Path to save the COCO-format predictions.
    """
    # Load YouTube VIS predictions
    with open(youtubevis_pred_path) as f:
        ytv_predictions = json.load(f)

    # Load converted dataset for image ID mapping
    with open(converted_dataset_path) as f:
        coco_dataset = json.load(f)

    # Create (video_id, frame_idx) -> image_id mapping
    image_id_map = {
        (img["video_id"], img["frame_index"]): img["id"]
        for img in coco_dataset["images"]
    }
    coco_annotations = []
    track_id_counter = 1  # Unique track ID generator

    for pred in tqdm(ytv_predictions):
        video_id = pred["video_id"]
        category_id = pred["category_id"]
        bboxes = pred["bboxes"]
        segmentations = pred.get("segmentations", [])  # Optional in predictions
        areas = pred.get("areas", [])  # Optional in predictions
        score = pred["score"]

        # Assign a unique track ID to this prediction
        track_id = track_id_counter
        track_id_counter += 1

        # If segmentations or areas are absent, pad with None to align with bboxes
        if len(segmentations) == 0:
            segmentations = [None] * len(bboxes)
        if len(areas) == 0:
            areas = [None] * len(bboxes)
        for frame_idx, (bbox, segmentation, area_from_pred) in enumerate(
            zip(bboxes, segmentations, areas)
        ):
            # Skip frames where the object is absent (None or all-zero bbox)
            if bbox is None or all(x == 0 for x in bbox):
                continue

            # Look up the corresponding image ID
            image_id = image_id_map.get((video_id, frame_idx))
            if image_id is None:
                raise RuntimeError(
                    f"prediction {video_id=}, {frame_idx=} does not match any "
                    "image in the converted COCO-format dataset"
                )

            # Extract bbox coordinates
            x, y, w, h = bbox

            # Use the predicted area if available, otherwise fall back to w * h
            if area_from_pred is not None and area_from_pred > 0:
                area = area_from_pred
            else:
                area = w * h

            # Create COCO annotation, keeping the video_id field
            coco_annotation = {
                "image_id": int(image_id),
                "video_id": video_id,
                "track_id": track_id,
                "category_id": category_id,
                "bbox": [float(x), float(y), float(w), float(h)],
                "area": float(area),
                "iscrowd": 0,
                "score": float(score),
            }

            # Add segmentation if available
            if segmentation is not None:
                coco_annotation["segmentation"] = segmentation

            coco_annotations.append(coco_annotation)

    # Save output
    with open(output_path, "w") as f:
        json.dump(coco_annotations, f)

    print(f"Converted {len(coco_annotations)} predictions to COCO format with video_id")