
Commit aa23661

Bugfix: Fixed error when copying items from different datasets (#10)
* Refactor image retrieval logic in images_get_list
* Refactor annotation copying and logging for improved clarity and error handling
* Enhance image cloning logic to handle name conflicts with improved resolution options and logging
* Refactor image cloning logic to eliminate duplicates and improve conflict resolution handling
1 parent 6ce7976 commit aa23661

2 files changed (+98 −42 lines)

src/api_utils.py

Lines changed: 8 additions & 9 deletions
@@ -73,12 +73,14 @@ def images_get_list(api: sly.Api, dataset_id, image_ids=None):
         ApiField.PATH_ORIGINAL,
         ApiField.CREATED_BY_ID[0][0],
     ]
-    filters = None
-    if image_ids is not None:
-        filters = [{"field": ApiField.ID, "operator": "in", "value": image_ids}]
-    img_infos = api.image.get_list(
-        dataset_id, filters=filters, fields=api_fields, force_metadata_for_links=False
-    )
+    if image_ids is None:
+        img_infos = api.image.get_list(
+            dataset_id, fields=api_fields, force_metadata_for_links=False
+        )
+    else:
+        img_infos = api.image.get_info_by_id_batch(
+            ids=image_ids, fields=api_fields, force_metadata_for_links=False
+        )
     return img_infos
 
 
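The old path always went through api.image.get_list(dataset_id, filters=...), which is scoped to a single dataset, so explicitly requested ids living in other datasets could not be resolved there, which is presumably the error behind this fix. A minimal sketch of the resulting calling pattern (helper name hypothetical; SDK calls as used in the diff):

    import supervisely as sly

    def fetch_image_infos(api: sly.Api, dataset_id: int, image_ids=None):
        # No explicit ids: list everything in the given dataset.
        if image_ids is None:
            return api.image.get_list(dataset_id, force_metadata_for_links=False)
        # Explicit ids: fetch by id, independent of which dataset each image is in.
        return api.image.get_info_by_id_batch(ids=image_ids, force_metadata_for_links=False)
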
@@ -92,7 +94,6 @@ def create_dataset(
     created_at: Optional[str] = None,
     updated_at: Optional[str] = None,
     created_by: Optional[int] = None,
-    custom_data: Optional[Dict] = None,
 ):
     effective_name = api.dataset._get_effective_new_name(
         project_id=project_id,
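custom_data handling is removed from create_dataset entirely (the parameter above and its use in the next hunk); the src/main.py hunks below apply it after creation through api.dataset.update instead. A minimal sketch of the new split, assuming the helper returns the created DatasetInfo (variable names illustrative):

    created = create_dataset(api, project_id=project_id, name=name)  # no custom_data here
    if custom_data:
        # applied in a follow-up request, as src/main.py now does
        api.dataset.update(created.id, custom_data=custom_data)
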
@@ -112,8 +113,6 @@ def create_dataset(
         data[ApiField.UPDATED_AT] = updated_at
     if created_by is not None:
         data[ApiField.CREATED_BY_ID[0][0]] = created_by
-    if custom_data is not None:
-        data[ApiField.CUSTOM_DATA] = custom_data
     try:
         response = api.post(
             "datasets.add",
src/main.py

Lines changed: 90 additions & 33 deletions
@@ -31,7 +31,7 @@
 executor = ThreadPoolExecutor(max_workers=5)
 merged_meta = None
 TASK_ID = None
-cancel_deletion = False # flag to cancel deletion of the source items
+cancel_deletion = False  # flag to cancel deletion of the source items
 
 if sly.is_development():
     api.app.workflow.enable()
@@ -393,6 +393,22 @@ def clone_images_with_annotations(
         image_infos = [info for info in image_infos if info.name not in existing]
         if progress_cb is not None:
             progress_cb(len_before - len(image_infos))
+    src_existing = set()
+    if options[JSONKEYS.CONFLICT_RESOLUTION_MODE] in [
+        JSONKEYS.CONFLICT_SKIP,
+        JSONKEYS.CONFLICT_REPLACE,
+    ]:
+        len_before = len(image_infos)
+        non_duplicate = []
+        for image_info in image_infos:
+            if image_info.name not in src_existing:
+                non_duplicate.append(image_info)
+            src_existing.add(image_info.name)
+        image_infos = non_duplicate
+        if progress_cb is not None:
+            progress_cb(len_before - len(image_infos))
+        if len(image_infos) != len_before:
+            logger.info("Some images were skipped due to name conflicts within source images.")
 
     if len(image_infos) == 0:
         return []
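Besides names that clash with the destination, this new block also drops duplicate names inside the source list itself, keeping only the first occurrence per name. A tiny worked example of the idiom (names hypothetical):

    names = ["a.jpg", "b.jpg", "a.jpg"]  # incoming source image names
    seen, unique = set(), []
    for name in names:
        if name not in seen:
            unique.append(name)
        seen.add(name)
    # unique == ["a.jpg", "b.jpg"]; the dropped duplicate is counted
    # through progress_cb and noted in the log, as in the loop above
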
@@ -407,22 +423,29 @@ def _copy_imgs(
         return infos, uploaded
 
     def _copy_anns(src: List[sly.ImageInfo], dst: List[sly.ImageInfo]):
-        try:
-            api.annotation.copy_batch_by_ids(
-                [i.id for i in src],
-                [i.id for i in dst],
-                save_source_date=options[JSONKEYS.PRESERVE_SRC_DATE],
-            )
-        except Exception as e:
-            if "Some users are not members of the destination group" in str(e):
-                raise ValueError(
-                    "Unable to copy annotations. Annotation creator is not a member of the destination team."
-                ) from e
-            else:
-                raise e
+        by_dataset = defaultdict(list)
+        for src_info, dst_info in zip(src, dst):
+            by_dataset[src_info.dataset_id].append((src_info, dst_info))
+        for pairs in by_dataset.values():
+            src_ids = [info[0].id for info in pairs]
+            dst_ids = [info[1].id for info in pairs]
+            try:
+                api.annotation.copy_batch_by_ids(
+                    src_ids,
+                    dst_ids,
+                    save_source_date=options[JSONKEYS.PRESERVE_SRC_DATE],
+                )
+            except Exception as e:
+                if "Some users are not members of the destination group" in str(e):
+                    raise ValueError(
+                        "Unable to copy annotations. Annotation creator is not a member of the destination team."
+                    ) from e
+                else:
+                    raise e
 
         return src, dst
 
+    reserved_names = set(existing.keys())
     to_rename = {}  # {new_name: old_name}
     upload_images_tasks = []
     for src_image_infos_batch in sly.batched(image_infos, UPLOAD_IMAGES_BATCH_SIZE):
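This rewrite is the heart of the fix: the single copy_batch_by_ids call previously received source ids that could span several datasets, which evidently failed when items came from different datasets (the bug in the commit title). Grouping the (src, dst) pairs by src_info.dataset_id guarantees each call sees ids from exactly one source dataset. A self-contained sketch of the grouping step (the Info tuple is a hypothetical stand-in for sly.ImageInfo):

    from collections import defaultdict
    from typing import NamedTuple

    class Info(NamedTuple):  # minimal stand-in for sly.ImageInfo
        id: int
        dataset_id: int

    src = [Info(1, 10), Info(2, 11), Info(3, 10)]
    dst = [Info(7, 99), Info(8, 99), Info(9, 99)]

    by_dataset = defaultdict(list)
    for s, d in zip(src, dst):
        by_dataset[s.dataset_id].append((s, d))

    for pairs in by_dataset.values():
        src_ids = [s.id for s, _ in pairs]
        dst_ids = [d.id for _, d in pairs]
        # one annotation-copy call per source dataset:
        # dataset 10 -> ids [1, 3] to [7, 9]; dataset 11 -> id [2] to [8]
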
@@ -434,12 +457,32 @@ def _copy_anns(src: List[sly.ImageInfo], dst: List[sly.ImageInfo]):
             JSONKEYS.CONFLICT_REPLACE,
         ]:
             for i, name in enumerate(names):
-                if name in existing:
-                    names[i] = (
-                        ".".join(name.split(".")[:-1]) + "_" + now + "." + name.split(".")[-1]
-                    )
+                j = 0
+                if name in reserved_names:
+                    new_name = name
+                    while new_name in reserved_names:
+                        if j == 0:
+                            new_name = (
+                                ".".join(name.split(".")[:-1])
+                                + "_"
+                                + now
+                                + "."
+                                + name.split(".")[-1]
+                            )
+                        else:
+                            new_name = (
+                                ".".join(name.split(".")[:-1])
+                                + "_"
+                                + now
+                                + f"_{j}"
+                                + "."
+                                + name.split(".")[-1]
+                            )
+                        j += 1
+                    names[i] = new_name
                     if options[JSONKEYS.CONFLICT_RESOLUTION_MODE] == JSONKEYS.CONFLICT_REPLACE:
                         to_rename[names[i]] = name
+                    reserved_names.add(new_name)
         upload_images_tasks.append(
             executor.submit(
                 _copy_imgs,
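Conflict renaming now probes until it finds a free name, trying <stem>_<now>.<ext> first and then <stem>_<now>_<j>.<ext> for j = 1, 2, and so on, and every assigned name joins reserved_names so later images in the same run cannot take it. A compact near-equivalent of the loop:

    def resolve_name(name: str, now: str, reserved: set) -> str:
        stem, ext = ".".join(name.split(".")[:-1]), name.split(".")[-1]
        new_name, j = name, 0
        while new_name in reserved:
            suffix = f"_{now}" if j == 0 else f"_{now}_{j}"
            new_name = f"{stem}{suffix}.{ext}"
            j += 1
        reserved.add(new_name)
        return new_name

    reserved = {"img.jpg", "img_2024.jpg"}
    resolve_name("img.jpg", "2024", reserved)  # -> "img_2024_1.jpg"
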
@@ -645,21 +688,20 @@ def _copy_anns(
             sf_idx_to_remove.reverse()
             for idx in sf_idx_to_remove:
                 ann.spatial_figures.pop(idx)
-            run_in_executor(
-                api.volume.figure.download_sf_geometries, mask_ids, mask_paths)
+            run_in_executor(api.volume.figure.download_sf_geometries, mask_ids, mask_paths)
             tasks.append(
                 executor.submit(
                     api.volume.annotation.append, dst_info.id, ann, key_id_map, volume_info=dst_info
                 )
             )
-
+
     for task in as_completed(tasks):
         task.result()
     progress_masks = tqdm(total=len(mask_paths), desc="Uploading Mask 3D geometries")
     for file in mask_paths:
-        with open(file, 'rb') as f:
+        with open(file, "rb") as f:
             key = UUID(os.path.basename(f.name))
-            api.volume.figure.upload_sf_geometries([key] , {key:f.read()}, key_id_map)
+            api.volume.figure.upload_sf_geometries([key], {key: f.read()}, key_id_map)
             progress_masks.update(1)
     progress_masks.close()
     if set_csm_warning:
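Aside from formatting, this hunk keeps the Mask 3D round trip intact: spatial-figure geometries are downloaded to files named after their UUID keys, then re-uploaded with the key parsed back out of the filename. A small sketch of that keying (path hypothetical):

    import os
    from uuid import UUID

    path = "/tmp/geometries/7c9e6679-7425-40de-944b-e07fc1f90ae7"
    with open(path, "rb") as f:
        key = UUID(os.path.basename(f.name))  # the filename is the figure's UUID
        payload = {key: f.read()}
        # api.volume.figure.upload_sf_geometries([key], payload, key_id_map)
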
@@ -1057,7 +1099,9 @@ def _create_rec(
         dataset_info, created_info, conflict_resolution_result=conflict_resolution_result
     )
     if dataset_info.custom_data:
-        run_in_executor(api.dataset.update, created_id, custom_data=dataset_info.custom_data)
+        run_in_executor(
+            api.dataset.update, created_id, custom_data=dataset_info.custom_data
+        )
     logger.info(
         "Created Dataset",
         extra={
@@ -1328,7 +1372,9 @@ def replace_dataset(src_dataset_info: sly.DatasetInfo, dst_dataset_info: sly.Dat
     """Remove src_dataset_info and change name of dst_dataset_info to src_dataset_info.name"""
     api.dataset.update(src_dataset_info.id, name=src_dataset_info.name + "__to_remove")
     api.dataset.remove(src_dataset_info.id)
-    return api.dataset.update(dst_dataset_info.id, name=src_dataset_info.name, custom_data=src_dataset_info.custom_data)
+    return api.dataset.update(
+        dst_dataset_info.id, name=src_dataset_info.name, custom_data=src_dataset_info.custom_data
+    )
 
 
 def run_in_executor(func, *args, **kwargs):
@@ -1370,7 +1416,7 @@ def copy_project_with_replace(
         parent_id=dst_dataset_id,
         created_at=src_project_info.created_at if perserve_date else None,
         updated_at=src_project_info.updated_at if perserve_date else None,
-        created_by=src_project_info.created_by_id if perserve_date else None,
+        created_by=src_project_info.created_by_id if perserve_date else None,
     )
     existing_datasets = find_children_in_tree(datasets_tree, parent_id=dst_dataset_id)
     created_datasets.append(
@@ -1669,9 +1715,12 @@ def move_project(
             "No datasets created. Skipping deletion", extra={"project_id": src_project_info.id}
         )
         return []
-
+
     if cancel_deletion:
-        logger.info("The source project will not be removed because some of its entities cannot be moved.", extra={"project_id": src_project_info.id})
+        logger.info(
+            "The source project will not be removed because some of its entities cannot be moved.",
+            extra={"project_id": src_project_info.id},
+        )
     else:
         logger.info("Removing source project", extra={"project_id": src_project_info.id})
         run_in_executor(api.project.remove, src_project_info.id)
@@ -1749,9 +1798,12 @@ def move_datasets_tree(
     if len(datasets_to_remove) == 0:
         logger.info("No datasets to remove", extra={"dataset_id": dst_dataset_id})
         return creted_datasets
-
+
     if cancel_deletion:
-        logger.info("The source datasets will not be removed because some of its entities cannot be moved.", extra={"dataset_id": dst_dataset_id})
+        logger.info(
+            "The source datasets will not be removed because some of its entities cannot be moved.",
+            extra={"dataset_id": dst_dataset_id},
+        )
     else:
         logger.info(
             "Removing source datasets",
@@ -1829,9 +1881,12 @@ def move_items_to_dataset(
         options=options,
         progress_cb=progress_cb,
         src_infos=item_infos,
-    )
+    )
     if cancel_deletion or len(created_item_infos) < len(item_infos):
-        logger.info("Some items were not moved. Skipping deletion of source items", extra={"dataset_id": dst_dataset_id})
+        logger.info(
+            "Some items were not moved. Skipping deletion of source items",
+            extra={"dataset_id": dst_dataset_id},
+        )
     else:
         delete_items(item_infos)
         cancel_deletion = False
@@ -2265,7 +2320,9 @@ def transfer_from_dataset(
         f"Dataset created with ID: {target_dataset.id} and name '{target_dataset.name}'"
     )
     if src_dataset.custom_data:
-        run_in_executor(api.dataset.update, target_dataset.id, custom_data=src_dataset.custom_data)
+        run_in_executor(
+            api.dataset.update, target_dataset.id, custom_data=src_dataset.custom_data
+        )
         logger.info(f"Dataset custom data has been updated")
     else:
         raise NotImplementedError(
