From 8239fc7fd725e966ca68d47c82809dbf2d67b00f Mon Sep 17 00:00:00 2001
From: xiaopoc
Date: Mon, 2 Jun 2025 11:35:31 -0700
Subject: [PATCH 1/6] Add CMakeLists.txt

---
 CMakeLists.txt | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 CMakeLists.txt

diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..99a9700
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,69 @@
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
+project(cuhpx LANGUAGES CXX CUDA)
+
+# C++ and CUDA standard
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+# Find Python and determine site-packages path
+find_package(Python REQUIRED COMPONENTS Interpreter Development.Module)
+execute_process(
+    COMMAND "${Python_EXECUTABLE}" -c "import sysconfig; print(sysconfig.get_paths()['purelib'])"
+    OUTPUT_VARIABLE PYTHON_SITE_PACKAGES
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+# Add PyTorch CMake path
+list(APPEND CMAKE_PREFIX_PATH "${PYTHON_SITE_PACKAGES}/torch/share/cmake")
+
+# Find required packages
+find_package(Torch REQUIRED)
+find_package(pybind11 REQUIRED)
+find_package(CUDA REQUIRED)
+
+# Optional: Locate libtorch_python.so dynamically
+set(TORCH_PYTHON_LIB "${PYTHON_SITE_PACKAGES}/torch/lib/libtorch_python.so")
+
+# Set CUDA architecture (change if needed)
+set(CMAKE_CUDA_ARCHITECTURES 90)
+list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr --expt-extended-lambda -O3)
+
+# Include paths
+include_directories(
+    ${TORCH_INCLUDE_DIRS}
+    ${CUDA_INCLUDE_DIRS}
+    ${CMAKE_CURRENT_SOURCE_DIR}/src
+)
+
+# ==================== cuhpx_fft ====================
+set(CUHPX_FFT_SRC
+    src/harmonic_transform/hpx_fft.cpp
+    src/harmonic_transform/hpx_fft_cuda.cu
+)
+
+pybind11_add_module(cuhpx_fft MODULE ${CUHPX_FFT_SRC})
+target_link_libraries(cuhpx_fft PRIVATE ${TORCH_LIBRARIES} ${CUDA_LIBRARIES})
+target_link_libraries(cuhpx_fft PRIVATE ${TORCH_PYTHON_LIB})
+target_compile_definitions(cuhpx_fft PRIVATE ${TORCH_CXX_FLAGS})
+set_target_properties(cuhpx_fft PROPERTIES PREFIX "" LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cuhpx)
+
+# ==================== cuhpx_remap ====================
+set(CUHPX_REMAP_SRC
+    src/data_remapping/hpx_remapping.cpp
+    src/data_remapping/hpx_remapping_cuda.cu
+)
+
+pybind11_add_module(cuhpx_remap MODULE ${CUHPX_REMAP_SRC})
+target_link_libraries(cuhpx_remap PRIVATE ${TORCH_LIBRARIES} ${CUDA_LIBRARIES})
+target_link_libraries(cuhpx_remap PRIVATE ${TORCH_PYTHON_LIB})
+target_compile_definitions(cuhpx_remap PRIVATE ${TORCH_CXX_FLAGS})
+set_target_properties(cuhpx_remap PROPERTIES PREFIX "" LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cuhpx)
+
+# ==================== Install ====================
+install(TARGETS cuhpx_fft cuhpx_remap DESTINATION cuhpx)
+install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cuhpx/data DESTINATION cuhpx FILES_MATCHING PATTERN "*.fits")
+
+message(STATUS "✅ cuhpx build system configured successfully.")
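A minimal smoke test for the configuration above (a sketch, not part of the series: it assumes the extensions were built with something like `cmake -S . -B build && cmake --build build`, that the resulting modules landed in the cuhpx/ directory named by LIBRARY_OUTPUT_DIRECTORY, and that torch is importable; the script name and path handling are illustrative):

    # smoke_test.py -- hypothetical check, not part of this patch series
    import sys

    import torch  # import first so libtorch symbols are resolved

    sys.path.insert(0, "cuhpx")  # directory holding the built .so files

    import cuhpx_remap

    print("loaded:", cuhpx_remap.__name__)
    print(cuhpx_remap.ring2nest.__doc__)  # pybind11 doc: "Convert ring to nest (CUDA)"
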
From 0de9f5d2b31b89b3a68d87b8fd19894d60eb4339 Mon Sep 17 00:00:00 2001
From: xiaopoc
Date: Mon, 2 Jun 2025 11:42:11 -0700
Subject: [PATCH 2/6] Clean up

---
 src/data_remapping/hpx_remapping.cpp     | 946 +++++++++++------------
 src/data_remapping/hpx_remapping.h       |   4 +-
 src/data_remapping/hpx_remapping_cuda.cu | 627 ++-------------
 src/harmonic_transform/hpx_fft.cpp       |   5 -
 4 files changed, 513 insertions(+), 1069 deletions(-)

diff --git a/src/data_remapping/hpx_remapping.cpp b/src/data_remapping/hpx_remapping.cpp
index fa136d0..aaff451 100644
--- a/src/data_remapping/hpx_remapping.cpp
+++ b/src/data_remapping/hpx_remapping.cpp
@@ -1,478 +1,468 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <torch/extension.h>
-#include "hpx_remapping.h"
-
-torch::Tensor ring2nest(torch::Tensor data_in_ring, const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_in_nest = torch::empty_like(data_in_ring);
-
-    ring2nest_dispatch(data_in_ring, data_in_nest, nside, num_elements);
-
-    return data_in_nest;
-}
-
-
-torch::Tensor nest2ring(torch::Tensor data_in_nest, const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_in_ring = torch::empty_like(data_in_nest);
-
-    nest2ring_dispatch(data_in_nest, data_in_ring, nside, num_elements);
-
-    return data_in_ring;
-}
-
-torch::Tensor nest2xy(torch::Tensor data_in_nest, const std::string& origin, const bool clockwise,
-    const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_in_xy = torch::empty_like(data_in_nest);
-
-    int src_origin = 0;
-    int dest_origin;
-
-    bool src_clockwise = false;
-    bool dest_clockwise = clockwise;
-
-    if (origin == "S"){
-        dest_origin = 0;
-    } else if (origin == "E"){
-        dest_origin = 1;
-    } else if (origin == "N"){
-        dest_origin = 2;
-    } else if (origin == "W"){
-        dest_origin = 3;
-    } else {
-        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
-    }
-
-
-    nest2xy_dispatch(data_in_nest, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
-
-    return data_in_xy;
-}
-
-torch::Tensor xy2nest(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise,
-    const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_in_nest = torch::empty_like(data_in_xy);
-
-
-    int src_origin;
-    int dest_origin = 0;
-
-    bool src_clockwise = clockwise;
-    bool dest_clockwise = false;
-
-    if (origin == "S"){
-        src_origin = 0;
-    } else if (origin == "E"){
-        src_origin = 1;
-    } else if (origin == "N"){
-        src_origin = 2;
-    } else if (origin == "W"){
-        src_origin = 3;
-    } else {
-        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
-    }
-
-
-    xy2nest_dispatch(data_in_xy, data_in_nest, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
-
-    return data_in_nest;
-}
-
-torch::Tensor ring2xy(torch::Tensor data_in_ring, const std::string& origin, const bool clockwise,
-    const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_in_xy = torch::empty_like(data_in_ring);
-
-    int src_origin = 0;
-    int dest_origin;
-
-    bool src_clockwise = false;
-    bool dest_clockwise = clockwise;
-
-    if (origin == "S"){
-        dest_origin = 0;
-    } else if (origin == "E"){
-        dest_origin = 1;
-    } else if (origin == "N"){
-        dest_origin = 2;
-    } else if (origin == "W"){
-        dest_origin = 3;
-    } else {
-        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
-    }
-
-
-    ring2xy_dispatch(data_in_ring, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
-
-    return data_in_xy;
-}
-
-torch::Tensor xy2ring(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise,
-    const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_in_ring = torch::empty_like(data_in_xy);
-
-    int src_origin;
-    int dest_origin = 0;
-
-    bool src_clockwise = clockwise;
-    bool dest_clockwise = false;
-
-    if (origin == "S"){
-        src_origin = 0;
-    } else if (origin == "E"){
-        src_origin = 1;
-    } else if (origin == "N"){
-        src_origin = 2;
-    } else if (origin == "W"){
-        src_origin = 3;
-    } else {
-        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
-    }
-
-
-    xy2ring_dispatch(data_in_xy, data_in_ring, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
-
-    return data_in_ring;
-}
-
-torch::Tensor xy2xy(torch::Tensor data_xy_in, const std::string& s_origin, const bool src_clockwise, const std::string& d_origin, const bool dest_clockwise,
-    const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_xy_out = torch::empty_like(data_xy_in);
-
-    int src_origin;
-    int dest_origin;
-
-
-    if (s_origin == "S"){
-        src_origin = 0;
-    } else if (s_origin == "E"){
-        src_origin = 1;
-    } else if (s_origin == "N"){
-        src_origin = 2;
-    } else if (s_origin == "W"){
-        src_origin = 3;
-    } else {
-        TORCH_CHECK(false, "The src origin must be one from S, E, N, W. Stop");
-    }
-
-    if (d_origin == "S"){
-        dest_origin = 0;
-    } else if (d_origin == "E"){
-        dest_origin = 1;
-    } else if (d_origin == "N"){
-        dest_origin = 2;
-    } else if (d_origin == "W"){
-        dest_origin = 3;
-    } else {
-        TORCH_CHECK(false, "The dest origin must be one from S, E, N, W. Stop");
-    }
-
-
-    xy2xy_dispatch(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
-
-    return data_xy_out;
-}
-
-void benchmark_nest_ring(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    benchmark_nest_ring_dispatch(data_in_nest, data_in_ring, nside, num_elements);
-}
-
-
-torch::Tensor ring2nest_batch(torch::Tensor data_in_ring, const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_in_nest = torch::empty_like(data_in_ring);
-
-    ring2nest_batch_dispatch(data_in_ring, data_in_nest, nside, num_elements);
-
-    return data_in_nest;
-}
-
-
-torch::Tensor nest2ring_batch(torch::Tensor data_in_nest, const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_in_ring = torch::empty_like(data_in_nest);
-
-    nest2ring_batch_dispatch(data_in_nest, data_in_ring, nside, num_elements);
-
-    return data_in_ring;
-}
-
-torch::Tensor nest2xy_batch(torch::Tensor data_in_nest, const std::string& origin, const bool clockwise,
-    const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_in_xy = torch::empty_like(data_in_nest);
-
-    int src_origin = 0;
-    int dest_origin;
-
-    bool src_clockwise = false;
-    bool dest_clockwise = clockwise;
-
-    if (origin == "S"){
-        dest_origin = 0;
-    } else if (origin == "E"){
-        dest_origin = 1;
-    } else if (origin == "N"){
-        dest_origin = 2;
-    } else if (origin == "W"){
-        dest_origin = 3;
-    } else {
-        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
-    }
-
-
-    nest2xy_batch_dispatch(data_in_nest, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
-
-    return data_in_xy;
-}
-
-torch::Tensor xy2nest_batch(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise,
-    const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_in_nest = torch::empty_like(data_in_xy);
-
-
-    int src_origin;
-    int dest_origin = 0;
-
-    bool src_clockwise = clockwise;
-    bool dest_clockwise = false;
-
-    if (origin == "S"){
-        src_origin = 0;
-    } else if (origin == "E"){
-        src_origin = 1;
-    } else if (origin == "N"){
-        src_origin = 2;
-    } else if (origin == "W"){
-        src_origin = 3;
-    } else {
-        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
-    }
-
-
-    xy2nest_batch_dispatch(data_in_xy, data_in_nest, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
-
-    return data_in_nest;
-}
-
-torch::Tensor ring2xy_batch(torch::Tensor data_in_ring, const std::string& origin, const bool clockwise,
-    const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_in_xy = torch::empty_like(data_in_ring);
-
-    int src_origin = 0;
-    int dest_origin;
-
-    bool src_clockwise = false;
-    bool dest_clockwise = clockwise;
-
-    if (origin == "S"){
-        dest_origin = 0;
-    } else if (origin == "E"){
-        dest_origin = 1;
-    } else if (origin == "N"){
-        dest_origin = 2;
-    } else if (origin == "W"){
-        dest_origin = 3;
-    } else {
-        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
-    }
-
-
-    ring2xy_batch_dispatch(data_in_ring, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
-
-    return data_in_xy;
-}
-
-torch::Tensor xy2ring_batch(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise,
-    const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_in_ring = torch::empty_like(data_in_xy);
-
-    int src_origin;
-    int dest_origin = 0;
-
-    bool src_clockwise = clockwise;
-    bool dest_clockwise = false;
-
-    if (origin == "S"){
-        src_origin = 0;
-    } else if (origin == "E"){
-        src_origin = 1;
-    } else if (origin == "N"){
-        src_origin = 2;
-    } else if (origin == "W"){
-        src_origin = 3;
-    } else {
-        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
-    }
-
-
-    xy2ring_batch_dispatch(data_in_xy, data_in_ring, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
-
-    return data_in_ring;
-}
-
-torch::Tensor xy2xy_batch(torch::Tensor data_xy_in, const std::string& s_origin, const bool src_clockwise, const std::string& d_origin, const bool dest_clockwise,
-    const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_xy_out = torch::empty_like(data_xy_in);
-
-    int src_origin;
-    int dest_origin;
-
-
-    if (s_origin == "S"){
-        src_origin = 0;
-    } else if (s_origin == "E"){
-        src_origin = 1;
-    } else if (s_origin == "N"){
-        src_origin = 2;
-    } else if (s_origin == "W"){
-        src_origin = 3;
-    } else {
-        TORCH_CHECK(false, "The src origin must be one from S, E, N, W. Stop");
-    }
-
-    if (d_origin == "S"){
-        dest_origin = 0;
-    } else if (d_origin == "E"){
-        dest_origin = 1;
-    } else if (d_origin == "N"){
-        dest_origin = 2;
-    } else if (d_origin == "W"){
-        dest_origin = 3;
-    } else {
-        TORCH_CHECK(false, "The dest origin must be one from S, E, N, W. Stop");
-    }
-
-
-    xy2xy_batch_dispatch(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
-
-    return data_xy_out;
-}
-
-
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-
-    m.def("ring2nest", &ring2nest, "Convert ring to nest (CUDA)");
-    m.def("nest2ring", &nest2ring, "Convert nest to ring (CUDA)");
-
-    m.def("nest2xy", &nest2xy, "Convert nest to xy (CUDA)");
-    m.def("xy2nest", &xy2nest, "Convert xy to nest (CUDA)");
-
-    m.def("ring2xy", &ring2xy, "Convert ring to xy (CUDA)");
-    m.def("xy2ring", &xy2ring, "Convert xy to ring (CUDA)");
-
-    m.def("xy2xy", &xy2xy, "Convert xy to xy (CUDA)");
-
-    m.def("benchmark_nest_ring", &benchmark_nest_ring, "Benchmark nest and ring (CUDA)");
-
-    m.def("ring2nest_batch", &ring2nest_batch, "Convert ring to nest (CUDA) in batch");
-    m.def("nest2ring_batch", &nest2ring_batch, "Convert nest to ring (CUDA) in batch");
-
-    m.def("nest2xy_batch", &nest2xy_batch, "Convert nest to xy (CUDA) in batch");
-    m.def("xy2nest_batch", &xy2nest_batch, "Convert xy to nest (CUDA) in batch");
-
-    m.def("ring2xy_batch", &ring2xy_batch, "Convert ring to xy (CUDA) in batch");
-    m.def("xy2ring_batch", &xy2ring_batch, "Convert xy to ring (CUDA) in batch");
-
-    m.def("xy2xy_batch", &xy2xy_batch, "Convert xy to xy (CUDA) in batch");
-
-}
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <torch/extension.h>
+#include <string>
+#include "hpx_remapping.h"
+
+torch::Tensor ring2nest(torch::Tensor data_in_ring, const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_in_nest = torch::empty_like(data_in_ring);
+
+    ring2nest_dispatch(data_in_ring, data_in_nest, nside, num_elements);
+
+    return data_in_nest;
+}
+
+
+torch::Tensor nest2ring(torch::Tensor data_in_nest, const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_in_ring = torch::empty_like(data_in_nest);
+
+    nest2ring_dispatch(data_in_nest, data_in_ring, nside, num_elements);
+
+    return data_in_ring;
+}
+
+torch::Tensor nest2xy(torch::Tensor data_in_nest, const std::string& origin, const bool clockwise,
+    const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_in_xy = torch::empty_like(data_in_nest);
+
+    int src_origin = 0;
+    int dest_origin;
+
+    bool src_clockwise = false;
+    bool dest_clockwise = clockwise;
+
+    if (origin == "S"){
+        dest_origin = 0;
+    } else if (origin == "E"){
+        dest_origin = 1;
+    } else if (origin == "N"){
+        dest_origin = 2;
+    } else if (origin == "W"){
+        dest_origin = 3;
+    } else {
+        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
+    }
+
+
+    nest2xy_dispatch(data_in_nest, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
+
+    return data_in_xy;
+}
+
+torch::Tensor xy2nest(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise,
+    const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_in_nest = torch::empty_like(data_in_xy);
+
+
+    int src_origin;
+    int dest_origin = 0;
+
+    bool src_clockwise = clockwise;
+    bool dest_clockwise = false;
+
+    if (origin == "S"){
+        src_origin = 0;
+    } else if (origin == "E"){
+        src_origin = 1;
+    } else if (origin == "N"){
+        src_origin = 2;
+    } else if (origin == "W"){
+        src_origin = 3;
+    } else {
+        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
+    }
+
+
+    xy2nest_dispatch(data_in_xy, data_in_nest, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
+
+    return data_in_nest;
+}
+
+torch::Tensor ring2xy(torch::Tensor data_in_ring, const std::string& origin, const bool clockwise,
+    const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_in_xy = torch::empty_like(data_in_ring);
+
+    int src_origin = 0;
+    int dest_origin;
+
+    bool src_clockwise = false;
+    bool dest_clockwise = clockwise;
+
+    if (origin == "S"){
+        dest_origin = 0;
+    } else if (origin == "E"){
+        dest_origin = 1;
+    } else if (origin == "N"){
+        dest_origin = 2;
+    } else if (origin == "W"){
+        dest_origin = 3;
+    } else {
+        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
+    }
+
+
+    ring2xy_dispatch(data_in_ring, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
+
+    return data_in_xy;
+}
+
+torch::Tensor xy2ring(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise,
+    const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_in_ring = torch::empty_like(data_in_xy);
+
+    int src_origin;
+    int dest_origin = 0;
+
+    bool src_clockwise = clockwise;
+    bool dest_clockwise = false;
+
+    if (origin == "S"){
+        src_origin = 0;
+    } else if (origin == "E"){
+        src_origin = 1;
+    } else if (origin == "N"){
+        src_origin = 2;
+    } else if (origin == "W"){
+        src_origin = 3;
+    } else {
+        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
+    }
+
+
+    xy2ring_dispatch(data_in_xy, data_in_ring, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
+
+    return data_in_ring;
+}
+
+torch::Tensor xy2xy(torch::Tensor data_xy_in, const std::string& s_origin, const bool src_clockwise, const std::string& d_origin, const bool dest_clockwise,
+    const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_xy_out = torch::empty_like(data_xy_in);
+
+    int src_origin;
+    int dest_origin;
+
+
+    if (s_origin == "S"){
+        src_origin = 0;
+    } else if (s_origin == "E"){
+        src_origin = 1;
+    } else if (s_origin == "N"){
+        src_origin = 2;
+    } else if (s_origin == "W"){
+        src_origin = 3;
+    } else {
+        TORCH_CHECK(false, "The src origin must be one from S, E, N, W. Stop");
+    }
+
+    if (d_origin == "S"){
+        dest_origin = 0;
+    } else if (d_origin == "E"){
+        dest_origin = 1;
+    } else if (d_origin == "N"){
+        dest_origin = 2;
+    } else if (d_origin == "W"){
+        dest_origin = 3;
+    } else {
+        TORCH_CHECK(false, "The dest origin must be one from S, E, N, W. Stop");
+    }
+
+
+    xy2xy_dispatch(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
+
+    return data_xy_out;
+}
+
+
+torch::Tensor ring2nest_batch(torch::Tensor data_in_ring, const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_in_nest = torch::empty_like(data_in_ring);
+
+    ring2nest_batch_dispatch(data_in_ring, data_in_nest, nside, num_elements);
+
+    return data_in_nest;
+}
+
+
+torch::Tensor nest2ring_batch(torch::Tensor data_in_nest, const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_in_ring = torch::empty_like(data_in_nest);
+
+    nest2ring_batch_dispatch(data_in_nest, data_in_ring, nside, num_elements);
+
+    return data_in_ring;
+}
+
+torch::Tensor nest2xy_batch(torch::Tensor data_in_nest, const std::string& origin, const bool clockwise,
+    const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_in_xy = torch::empty_like(data_in_nest);
+
+    int src_origin = 0;
+    int dest_origin;
+
+    bool src_clockwise = false;
+    bool dest_clockwise = clockwise;
+
+    if (origin == "S"){
+        dest_origin = 0;
+    } else if (origin == "E"){
+        dest_origin = 1;
+    } else if (origin == "N"){
+        dest_origin = 2;
+    } else if (origin == "W"){
+        dest_origin = 3;
+    } else {
+        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
+    }
+
+
+    nest2xy_batch_dispatch(data_in_nest, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
+
+    return data_in_xy;
+}
+
+torch::Tensor xy2nest_batch(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise,
+    const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_in_nest = torch::empty_like(data_in_xy);
+
+
+    int src_origin;
+    int dest_origin = 0;
+
+    bool src_clockwise = clockwise;
+    bool dest_clockwise = false;
+
+    if (origin == "S"){
+        src_origin = 0;
+    } else if (origin == "E"){
+        src_origin = 1;
+    } else if (origin == "N"){
+        src_origin = 2;
+    } else if (origin == "W"){
+        src_origin = 3;
+    } else {
+        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
+    }
+
+
+    xy2nest_batch_dispatch(data_in_xy, data_in_nest, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
+
+    return data_in_nest;
+}
+
+torch::Tensor ring2xy_batch(torch::Tensor data_in_ring, const std::string& origin, const bool clockwise,
+    const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_in_xy = torch::empty_like(data_in_ring);
+
+    int src_origin = 0;
+    int dest_origin;
+
+    bool src_clockwise = false;
+    bool dest_clockwise = clockwise;
+
+    if (origin == "S"){
+        dest_origin = 0;
+    } else if (origin == "E"){
+        dest_origin = 1;
+    } else if (origin == "N"){
+        dest_origin = 2;
+    } else if (origin == "W"){
+        dest_origin = 3;
+    } else {
+        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
+    }
+
+
+    ring2xy_batch_dispatch(data_in_ring, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
+
+    return data_in_xy;
+}
+
+torch::Tensor xy2ring_batch(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise,
+    const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_in_ring = torch::empty_like(data_in_xy);
+
+    int src_origin;
+    int dest_origin = 0;
+
+    bool src_clockwise = clockwise;
+    bool dest_clockwise = false;
+
+    if (origin == "S"){
+        src_origin = 0;
+    } else if (origin == "E"){
+        src_origin = 1;
+    } else if (origin == "N"){
+        src_origin = 2;
+    } else if (origin == "W"){
+        src_origin = 3;
+    } else {
+        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
+    }
+
+
+    xy2ring_batch_dispatch(data_in_xy, data_in_ring, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
+
+    return data_in_ring;
+}
+
+torch::Tensor xy2xy_batch(torch::Tensor data_xy_in, const std::string& s_origin, const bool src_clockwise, const std::string& d_origin, const bool dest_clockwise,
+    const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_xy_out = torch::empty_like(data_xy_in);
+
+    int src_origin;
+    int dest_origin;
+
+
+    if (s_origin == "S"){
+        src_origin = 0;
+    } else if (s_origin == "E"){
+        src_origin = 1;
+    } else if (s_origin == "N"){
+        src_origin = 2;
+    } else if (s_origin == "W"){
+        src_origin = 3;
+    } else {
+        TORCH_CHECK(false, "The src origin must be one from S, E, N, W. Stop");
+    }
+
+    if (d_origin == "S"){
+        dest_origin = 0;
+    } else if (d_origin == "E"){
+        dest_origin = 1;
+    } else if (d_origin == "N"){
+        dest_origin = 2;
+    } else if (d_origin == "W"){
+        dest_origin = 3;
+    } else {
+        TORCH_CHECK(false, "The dest origin must be one from S, E, N, W. Stop");
+    }
+
+
+    xy2xy_batch_dispatch(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
+
+    return data_xy_out;
+}
+
+
+
+PYBIND11_MODULE(cuhpx_remap, m) {
+
+    m.def("ring2nest", &ring2nest, "Convert ring to nest (CUDA)");
+    m.def("nest2ring", &nest2ring, "Convert nest to ring (CUDA)");
+
+    m.def("nest2xy", &nest2xy, "Convert nest to xy (CUDA)");
+    m.def("xy2nest", &xy2nest, "Convert xy to nest (CUDA)");
+
+    m.def("ring2xy", &ring2xy, "Convert ring to xy (CUDA)");
+    m.def("xy2ring", &xy2ring, "Convert xy to ring (CUDA)");
+
+    m.def("xy2xy", &xy2xy, "Convert xy to xy (CUDA)");
+
+    m.def("ring2nest_batch", &ring2nest_batch, "Convert ring to nest (CUDA) in batch");
+    m.def("nest2ring_batch", &nest2ring_batch, "Convert nest to ring (CUDA) in batch");
+
+    m.def("nest2xy_batch", &nest2xy_batch, "Convert nest to xy (CUDA) in batch");
+    m.def("xy2nest_batch", &xy2nest_batch, "Convert xy to nest (CUDA) in batch");
+
+    m.def("ring2xy_batch", &ring2xy_batch, "Convert ring to xy (CUDA) in batch");
+    m.def("xy2ring_batch", &xy2ring_batch, "Convert xy to ring (CUDA) in batch");
+
+    m.def("xy2xy_batch", &xy2xy_batch, "Convert xy to xy (CUDA) in batch");
+
+}
+

diff --git a/src/data_remapping/hpx_remapping.h b/src/data_remapping/hpx_remapping.h
index 2edc77d..5ed936a 100644
--- a/src/data_remapping/hpx_remapping.h
+++ b/src/data_remapping/hpx_remapping.h
@@ -43,9 +43,6 @@ void xy2ring_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_ring, cons
 void xy2xy_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, const int src_origin, const bool src_clockwise,
     const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements);
 
-void benchmark_nest_ring_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements);
-
-
 void ring2nest_batch_dispatch(torch::Tensor data_in_ring, torch::Tensor data_in_nest, const int nside, const size_t num_elements);
 
 void nest2ring_batch_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements);
 
@@ -66,3 +63,4 @@ void xy2xy_batch_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, c
     const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements);
 
 #endif // HPX_REMAPPING_H
+
Stop"); + } + + + xy2xy_batch_dispatch(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); + + return data_xy_out; +} + + + +PYBIND11_MODULE(cuhpx_remap, m) { + + m.def("ring2nest", &ring2nest, "Convert ring to nest (CUDA)"); + m.def("nest2ring", &nest2ring, "Convert nest to ring (CUDA)"); + + m.def("nest2xy", &nest2xy, "Convert nest to xy (CUDA)"); + m.def("xy2nest", &xy2nest, "Convert xy to nest (CUDA)"); + + m.def("ring2xy", &ring2xy, "Convert ring to xy (CUDA)"); + m.def("xy2ring", &xy2ring, "Convert xy to ring (CUDA)"); + + m.def("xy2xy", &xy2xy, "Convert xy to xy (CUDA)"); + + m.def("ring2nest_batch", &ring2nest_batch, "Convert ring to nest (CUDA) in batch"); + m.def("nest2ring_batch", &nest2ring_batch, "Convert nest to ring (CUDA) in batch"); + + m.def("nest2xy_batch", &nest2xy_batch, "Convert nest to xy (CUDA) in batch"); + m.def("xy2nest_batch", &xy2nest_batch, "Convert xy to nest (CUDA) in batch"); + + m.def("ring2xy_batch", &ring2xy_batch, "Convert ring to xy (CUDA) in batch"); + m.def("xy2ring_batch", &xy2ring_batch, "Convert xy to ring (CUDA) in batch"); + + m.def("xy2xy_batch", &xy2xy_batch, "Convert xy to xy (CUDA) in batch"); + +} + diff --git a/src/data_remapping/hpx_remapping.h b/src/data_remapping/hpx_remapping.h index 2edc77d..5ed936a 100644 --- a/src/data_remapping/hpx_remapping.h +++ b/src/data_remapping/hpx_remapping.h @@ -43,9 +43,6 @@ void xy2ring_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_ring, cons void xy2xy_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, const int src_origin, const bool src_clockwise, const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements); -void benchmark_nest_ring_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements); - - void ring2nest_batch_dispatch(torch::Tensor data_in_ring, torch::Tensor data_in_nest, const int nside, const size_t num_elements); void nest2ring_batch_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements); @@ -66,3 +63,4 @@ void xy2xy_batch_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, c const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements); #endif // HPX_REMAPPING_H + diff --git a/src/data_remapping/hpx_remapping_cuda.cu b/src/data_remapping/hpx_remapping_cuda.cu index 41cf8f0..eb0f677 100644 --- a/src/data_remapping/hpx_remapping_cuda.cu +++ b/src/data_remapping/hpx_remapping_cuda.cu @@ -133,15 +133,15 @@ __device__ inline void correct_xy_orient(int& ix, int& iy, const int order, cons iy = new_iy; } -template +template __global__ void rearrange_data_kernel_naive(const T* d_data_in, T* d_data_out, const I* d_pix_array, const size_t num_elements){ - + int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < num_elements){ I pix = d_pix_array[tid]; d_data_out[pix] = d_data_in[tid]; - + } } @@ -157,7 +157,7 @@ template } -template +template __global__ void ring2xyf_kernel(const I* pix_array, int* ix_array, int* iy_array, int* face_num_array, const int order, const I nside, const size_t num_elements){ @@ -192,10 +192,10 @@ __global__ void ring2xyf_kernel(const I* pix_array, int* ix_array, int* iy_array I irm = 2*nside + 1 - tmp; I ifm = iphi - (ire >> 1) + nside - 1; I ifp = iphi - (irm >> 1) + nside - 1; - + ifm >>= order; ifp >>= order; - + face_num = (ifp == ifm) ? (ifp | 4) : ((ifp < ifm) ? 
ifp : (ifm + 8)); } else { // South Polar Cap I ip = npix - pix; @@ -224,8 +224,8 @@ __global__ void ring2xyf_kernel(const I* pix_array, int* ix_array, int* iy_array } -template -__global__ void xyf2ring_kernel(I* pix_array, const int* ix_array, const int* iy_array, const int* face_num_array, +template +__global__ void xyf2ring_kernel(I* pix_array, const int* ix_array, const int* iy_array, const int* face_num_array, const int order, const I nside, const size_t num_elements) { int tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -281,7 +281,7 @@ __global__ void xyf2ring_kernel(I* pix_array, const int* ix_array, const int* iy } } -template +template __global__ void nest2xyf_kernel(const I* pix_array, int* ix_array, int* iy_array, int* face_num_array, const int order, const I nside, const size_t num_elements) { @@ -303,7 +303,7 @@ __global__ void nest2xyf_kernel(const I* pix_array, int* ix_array, int* iy_array } } -template +template __global__ void xyf2nest_kernel(I* pix_array, const int* ix_array, const int* iy_array, const int* face_num_array, const int order, const I nside, const size_t num_elements){ @@ -601,7 +601,7 @@ void ring2nest_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data_in_ ring2xyf_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); xyf2nest_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); rearrange_data_kernel_naive <<>>(data_in_ring.data_ptr(), data_in_nest.data_ptr(), d_pix_array, num_elements); - + //cudaDeviceSynchronize(); cudaFree(d_pix_array); @@ -633,7 +633,7 @@ void nest2ring_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_ nest2xyf_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); xyf2ring_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); rearrange_data_kernel_naive <<>>(data_in_nest.data_ptr(), data_in_ring.data_ptr(), d_pix_array, num_elements); - + //cudaDeviceSynchronize(); cudaFree(d_pix_array); @@ -643,170 +643,6 @@ void nest2ring_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_ } -template -void benchmark_nest_ring_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements) { - - int block_size = 1024; - int num_blocks = (num_elements + block_size - 1) / block_size; - - int order = compute_order(nside); - - int* d_pix_array; - int* d_ix_array; - int* d_iy_array; - int* d_face_num_array; - - cudaMalloc(&d_pix_array, num_elements * sizeof(int)); - cudaMalloc(&d_ix_array, num_elements * sizeof(int)); - cudaMalloc(&d_iy_array, num_elements * sizeof(int)); - cudaMalloc(&d_face_num_array, num_elements * sizeof(int)); - - - float total_elapsed_time_r2n = 0.0f; - float total_elapsed_time_n2r = 0.0f; - - float total_elapsed_time_n2xy = 0.0f; - float total_elapsed_time_xy2n = 0.0f; - - float elapsed_time; - - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - cudaStream_t stream; - cudaStreamCreate(&stream); - - int num_runs = 3; - int k = 2; - bool flip = false; - - // warm-up run - for (int i = 0; i < num_runs; ++i) - { - // nest 2 ring - initialize_pix_array<<>>(d_pix_array, num_elements); - nest2xyf_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); - xyf2ring_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); - rearrange_data_kernel_naive <<>>(data_in_nest.data_ptr(), 
data_in_ring.data_ptr(), d_pix_array, num_elements); - - // ring 2 nest - initialize_pix_array<<>>(d_pix_array, num_elements); - ring2xyf_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); - xyf2nest_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); - rearrange_data_kernel_naive <<>>(data_in_ring.data_ptr(), data_in_nest.data_ptr(), d_pix_array, num_elements); - - // nest2xy - initialize_pix_array<<>>(d_pix_array, num_elements); - nest2xyf_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); - xyf2flat_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements, flip, k); - rearrange_data_kernel_naive <<>>(data_in_nest.data_ptr(), data_in_ring.data_ptr(), d_pix_array, num_elements); - - // xy2nest - initialize_pix_array<<>>(d_pix_array, num_elements); - flat2xyf_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements, flip, k); - xyf2nest_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); - rearrange_data_kernel_naive <<>>(data_in_ring.data_ptr(), data_in_nest.data_ptr(), d_pix_array, num_elements); - } - - cudaDeviceSynchronize(); - - num_runs = 10; - - - for (int i = 0; i < num_runs; ++i) - { - - // N2R - cudaEventRecord(start, stream); - - initialize_pix_array<<>>(d_pix_array, num_elements); - nest2xyf_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); - xyf2ring_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); - rearrange_data_kernel_naive <<>>(data_in_nest.data_ptr(), data_in_ring.data_ptr(), d_pix_array, num_elements); - - cudaEventRecord(stop, stream); - cudaEventSynchronize(stop); - - elapsed_time = 0.0f; - cudaEventElapsedTime(&elapsed_time, start, stop); - total_elapsed_time_n2r += elapsed_time; - - // R2N - - cudaEventRecord(start, stream); - - initialize_pix_array<<>>(d_pix_array, num_elements); - ring2xyf_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); - xyf2nest_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); - rearrange_data_kernel_naive <<>>(data_in_ring.data_ptr(), data_in_nest.data_ptr(), d_pix_array, num_elements); - - cudaEventRecord(stop, stream); - cudaEventSynchronize(stop); - - elapsed_time = 0.0f; - cudaEventElapsedTime(&elapsed_time, start, stop); - total_elapsed_time_r2n += elapsed_time; - - // nest2xy - cudaEventRecord(start, stream); - - initialize_pix_array<<>>(d_pix_array, num_elements); - nest2xyf_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); - xyf2flat_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements, flip, k); - rearrange_data_kernel_naive <<>>(data_in_nest.data_ptr(), data_in_ring.data_ptr(), d_pix_array, num_elements); - - cudaEventRecord(stop, stream); - cudaEventSynchronize(stop); - - elapsed_time = 0.0f; - cudaEventElapsedTime(&elapsed_time, start, stop); - total_elapsed_time_n2xy += elapsed_time; - - // xy2nest - cudaEventRecord(start, stream); - - initialize_pix_array<<>>(d_pix_array, num_elements); - flat2xyf_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements, flip, k); - xyf2nest_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); - 
rearrange_data_kernel_naive <<>>(data_in_ring.data_ptr(), data_in_nest.data_ptr(), d_pix_array, num_elements); - - cudaEventRecord(stop, stream); - cudaEventSynchronize(stop); - - elapsed_time = 0.0f; - cudaEventElapsedTime(&elapsed_time, start, stop); - total_elapsed_time_xy2n += elapsed_time; - - } - - float average_elapsed_time_n2r = total_elapsed_time_n2r / num_runs; - float average_elapsed_time_r2n = total_elapsed_time_r2n / num_runs; - float average_elapsed_time_n2xy = total_elapsed_time_n2xy / num_runs; - float average_elapsed_time_xy2n = total_elapsed_time_xy2n / num_runs; - - - std::cout << "Average elapsed time used by CUDA data ring2nest kernel over " << num_runs << " runs: " << average_elapsed_time_r2n << "ms" << std::endl; - std::cout << "Average elapsed time used by CUDA data nest2ring kernel over " << num_runs << " runs: " << average_elapsed_time_n2r << "ms" << std::endl; - - std::cout << "Average elapsed time used by CUDA data nest2XY kernel over " << num_runs << " runs: " << average_elapsed_time_n2xy << "ms" << std::endl; - std::cout << "Average elapsed time used by CUDA data XY2nest kernel over " << num_runs << " runs: " << average_elapsed_time_xy2n << "ms" << std::endl; - - cudaEventDestroy(start); - cudaEventDestroy(stop); - cudaStreamDestroy(stream); - - cudaDeviceSynchronize(); - - - cudaFree(d_pix_array); - cudaFree(d_ix_array); - cudaFree(d_iy_array); - cudaFree(d_face_num_array); -} - - // Dispatch functions for each wrapper void ring2nest_dispatch(torch::Tensor data_in_ring, torch::Tensor data_in_nest, const int nside, const size_t num_elements) { @@ -825,11 +661,11 @@ void nest2ring_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_ring, void nest2xy_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_xy, const int src_origin, const bool src_clockwise, const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements) { - + AT_DISPATCH_ALL_TYPES(data_in_nest.scalar_type(), "nest2xy", ([&] { nest2xy_kernel_wrapper(data_in_nest, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); })); - + } void xy2nest_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_nest, const int src_origin, const bool src_clockwise, @@ -847,7 +683,7 @@ void ring2xy_dispatch(torch::Tensor data_in_ring, torch::Tensor data_in_xy, cons AT_DISPATCH_ALL_TYPES(data_in_ring.scalar_type(), "ring2xy", ([&] { ring2xy_kernel_wrapper(data_in_ring, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); })); - + } void xy2ring_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_ring, const int src_origin, const bool src_clockwise, @@ -856,7 +692,7 @@ void xy2ring_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_ring, cons AT_DISPATCH_ALL_TYPES(data_in_xy.scalar_type(), "xy2ring", ([&] { xy2ring_kernel_wrapper(data_in_xy, data_in_ring, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); })); - + } void xy2xy_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, const int src_origin, const bool src_clockwise, @@ -865,61 +701,51 @@ void xy2xy_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, const i AT_DISPATCH_ALL_TYPES(data_xy_in.scalar_type(), "xy2xy", ([&] { xy2xy_kernel_wrapper(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); })); - + } -void benchmark_nest_ring_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements) { - 
- AT_DISPATCH_ALL_TYPES(data_in_nest.scalar_type(), "benchmark_nest_ring", ([&] { - benchmark_nest_ring_kernel_wrapper(data_in_nest, data_in_ring, nside, num_elements); - })); - -} - - - template -__global__ void rearrange_data_kernel_3d_batch(const T* d_data_in, T* d_data_out, const I* d_pix_array, +__global__ void rearrange_data_kernel_3d_batch(const T* d_data_in, T* d_data_out, const I* d_pix_array, const size_t m, const size_t n, const size_t num_elements) { - + // d_data_in: m by n by num_elements 3D matrix int idz = blockIdx.z * blockDim.z + threadIdx.z; int idy = blockIdx.y * blockDim.y + threadIdx.y; int idx = blockIdx.x * blockDim.x + threadIdx.x; - + int tid = idz * n * num_elements + idy * num_elements + idx; - + if (idz < m && idy < n && idx < num_elements) { - I pix = d_pix_array[idx]; + I pix = d_pix_array[idx]; d_data_out[tid-idx + pix] = d_data_in[tid]; } } template -__global__ void rearrange_data_kernel_batch(const T* d_data_in, T* d_data_out, const I* d_pix_array, +__global__ void rearrange_data_kernel_batch(const T* d_data_in, T* d_data_out, const I* d_pix_array, const size_t n, const size_t num_elements) { unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; - + unsigned int tid = idy * num_elements + idx; - + if (idy < n && idx < num_elements) { - I pix = d_pix_array[idx]; + I pix = d_pix_array[idx]; d_data_out[tid-idx + pix] = d_data_in[tid]; } } template -void xy2xy_batch_kernel_wrapper(torch::Tensor data_xy_in, torch::Tensor data_xy_out, +void xy2xy_batch_kernel_wrapper(torch::Tensor data_xy_in, torch::Tensor data_xy_out, const int src_origin, const bool src_clockwise, - const int dest_origin, const bool dest_clockwise, + const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements) { // Check tensor dimensions and sizes @@ -965,9 +791,9 @@ void xy2xy_batch_kernel_wrapper(torch::Tensor data_xy_in, torch::Tensor data_xy_ template -void nest2xy_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_xy, +void nest2xy_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_xy, const int src_origin, const bool src_clockwise, - const int dest_origin, const bool dest_clockwise, + const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements) { // Check tensor dimensions and sizes @@ -1022,9 +848,9 @@ void nest2xy_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data } template -void xy2nest_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_in_nest, +void xy2nest_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_in_nest, const int src_origin, const bool src_clockwise, - const int dest_origin, const bool dest_clockwise, + const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements) { // Check tensor dimensions and sizes @@ -1079,9 +905,9 @@ void xy2nest_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_i } template -void ring2xy_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data_in_xy, +void ring2xy_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data_in_xy, const int src_origin, const bool src_clockwise, - const int dest_origin, const bool dest_clockwise, + const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements) { // Check tensor dimensions and sizes @@ -1137,9 +963,9 @@ void ring2xy_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data 
template -void xy2ring_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_in_ring, +void xy2ring_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_in_ring, const int src_origin, const bool src_clockwise, - const int dest_origin, const bool dest_clockwise, + const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements) { // Check tensor dimensions and sizes @@ -1195,7 +1021,7 @@ void xy2ring_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_i template -void ring2nest_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data_in_nest, +void ring2nest_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data_in_nest, const int nside, const size_t num_elements) { // Check tensor dimensions and sizes @@ -1244,7 +1070,7 @@ void ring2nest_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor da } template -void nest2ring_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_ring, +void nest2ring_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements) { // Check tensor dimensions and sizes @@ -1292,372 +1118,6 @@ void nest2ring_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor da cudaFree(d_face_num_array); } -/* -template -void xy2xy_batch_kernel_wrapper(torch::Tensor data_xy_in, torch::Tensor data_xy_out, - const int src_origin, const bool src_clockwise, - const int dest_origin, const bool dest_clockwise, - const int nside, const size_t num_elements) { - - // Check tensor dimensions and sizes - TORCH_CHECK(data_xy_in.dim() == 3, "data_xy_in must be a 3D tensor"); - TORCH_CHECK(data_xy_out.dim() == 3, "data_xy_out must be a 3D tensor"); - - int m = data_xy_in.size(0); // First dimension - int n = data_xy_in.size(1); // Second dimension - - int order = compute_order(nside); - - int block_size = 256; - int num_blocks = (num_elements + block_size - 1)/ block_size; - - // Calculate block and grid dimensions - dim3 block_dim(32, 4, 4); // Example 3D block dimensions - dim3 grid_dim((num_elements + block_dim.x - 1) / block_dim.x, - (n + block_dim.y - 1) / block_dim.y, - (m + block_dim.z - 1) / block_dim.z); - - // Allocate device memory - int* d_pix_array; - - cudaMalloc(&d_pix_array, num_elements * sizeof(int)); - - // Compute rotation parameters - bool flip = src_clockwise != dest_clockwise; - int rotations = dest_origin - src_origin; - int k = (dest_clockwise ? 
-rotations : rotations) & 3;
-    if (k < 0) { k += 4; }
-
-    // Launch kernels
-    initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
-    flat2flat_kernel<<<num_blocks, block_size>>>(d_pix_array, order, nside, num_elements, flip, k);
-
-    // Adjust kernel launch for 3D data
-    rearrange_data_kernel_3d_batch<<<grid_dim, block_dim>>>(data_xy_in.data_ptr<T>(), data_xy_out.data_ptr<T>(), d_pix_array, m, n, num_elements);
-
-    // Free device memory
-    cudaFree(d_pix_array);
-}
-
-
-template <typename T>
-void nest2xy_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_xy,
-                                  const int src_origin, const bool src_clockwise,
-                                  const int dest_origin, const bool dest_clockwise,
-                                  const int nside, const size_t num_elements) {
-
-    // Check tensor dimensions and sizes
-    TORCH_CHECK(data_in_nest.dim() == 3, "data_in_nest should be a 3D tensor");
-    TORCH_CHECK(data_in_xy.dim() == 3, "data_in_xy should be a 3D tensor");
-
-    int m = data_in_nest.size(0); // First dimension
-    int n = data_in_nest.size(1); // Second dimension
-
-    int order = compute_order(nside);
-
-    // Calculate block and grid dimensions
-    dim3 block_dim(32, 4, 4); // Example 3D block dimensions
-    dim3 grid_dim((num_elements + block_dim.x - 1) / block_dim.x,
-                  (n + block_dim.y - 1) / block_dim.y,
-                  (m + block_dim.z - 1) / block_dim.z);
-
-    int block_size = 256;
-    int num_blocks = (num_elements + block_size - 1)/ block_size;
-
-    // Allocate device memory
-    int* d_pix_array;
-    int* d_ix_array;
-    int* d_iy_array;
-    int* d_face_num_array;
-
-    cudaMalloc(&d_pix_array, num_elements * sizeof(int));
-    cudaMalloc(&d_ix_array, num_elements * sizeof(int));
-    cudaMalloc(&d_iy_array, num_elements * sizeof(int));
-    cudaMalloc(&d_face_num_array, num_elements * sizeof(int));
-
-    // Compute rotation parameters
-    bool flip = src_clockwise != dest_clockwise;
-    int rotations = dest_origin - src_origin;
-    int k = (dest_clockwise ? -rotations : rotations) & 3;
-    if (k < 0) { k += 4; }
-
-    // Launch kernels
-    initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
-    nest2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
-    xyf2flat_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements, flip, k);
-
-    // Adjust kernel launch for 3D data
-    rearrange_data_kernel_3d_batch<<<grid_dim, block_dim>>>(data_in_nest.data_ptr<T>(), data_in_xy.data_ptr<T>(), d_pix_array, m, n, num_elements);
-
-    // Free device memory
-    cudaFree(d_pix_array);
-    cudaFree(d_ix_array);
-    cudaFree(d_iy_array);
-    cudaFree(d_face_num_array);
-}
-
-template <typename T>
-void xy2nest_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_in_nest,
-                                  const int src_origin, const bool src_clockwise,
-                                  const int dest_origin, const bool dest_clockwise,
-                                  const int nside, const size_t num_elements) {
-
-    // Check tensor dimensions and sizes
-    TORCH_CHECK(data_in_xy.dim() == 3, "data_in_xy must be a 3D tensor");
-    TORCH_CHECK(data_in_nest.dim() == 3, "data_in_nest must be a 3D tensor");
-
-    int m = data_in_xy.size(0); // First dimension
-    int n = data_in_xy.size(1); // Second dimension
-
-    int order = compute_order(nside);
-
-    int block_size = 256;
-    int num_blocks = (num_elements + block_size - 1)/ block_size;
-
-    // Calculate block and grid dimensions
-    dim3 block_dim(32, 4, 4); // Example 3D block dimensions
-    dim3 grid_dim((num_elements + block_dim.x - 1) / block_dim.x,
-                  (n + block_dim.y - 1) / block_dim.y,
-                  (m + block_dim.z - 1) / block_dim.z);
-
-    // Allocate device memory
-    int* d_pix_array;
-    int* d_ix_array;
-    int* d_iy_array;
-    int* d_face_num_array;
-
-    cudaMalloc(&d_pix_array, num_elements * sizeof(int));
-    cudaMalloc(&d_ix_array, num_elements * sizeof(int));
-    cudaMalloc(&d_iy_array, num_elements * sizeof(int));
-    cudaMalloc(&d_face_num_array, num_elements * sizeof(int));
-
-    // Compute rotation parameters
-    bool flip = src_clockwise != dest_clockwise;
-    int rotations = dest_origin - src_origin;
-    int k = (dest_clockwise ? -rotations : rotations) & 3;
-    if (k < 0) { k += 4; }
-
-    // Launch kernels
-    initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
-    flat2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements, flip, k);
-    xyf2nest_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
-
-    // Adjust kernel launch for 3D data
-    rearrange_data_kernel_3d_batch<<<grid_dim, block_dim>>>(data_in_xy.data_ptr<T>(), data_in_nest.data_ptr<T>(), d_pix_array, m, n, num_elements);
-
-    // Free device memory
-    cudaFree(d_pix_array);
-    cudaFree(d_ix_array);
-    cudaFree(d_iy_array);
-    cudaFree(d_face_num_array);
-}
-
-template <typename T>
-void ring2xy_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data_in_xy,
-                                  const int src_origin, const bool src_clockwise,
-                                  const int dest_origin, const bool dest_clockwise,
-                                  const int nside, const size_t num_elements) {
-
-    // Check tensor dimensions and sizes
-    TORCH_CHECK(data_in_ring.dim() == 3, "data_in_ring must be a 3D tensor");
-    TORCH_CHECK(data_in_xy.dim() == 3, "data_in_xy must be a 3D tensor");
-
-    int m = data_in_ring.size(0); // First dimension
-    int n = data_in_ring.size(1); // Second dimension
-    int order = compute_order(nside);
-
-    int block_size = 256;
-    int num_blocks = (num_elements + block_size - 1)/ block_size;
-
-    // Calculate block and grid dimensions
-    dim3 block_dim(32, 4, 4); // Example 3D block dimensions
-    dim3 grid_dim((num_elements + block_dim.x - 1) / block_dim.x,
-                  (n + block_dim.y - 1) / block_dim.y,
-                  (m + block_dim.z - 1) / block_dim.z);
-
-    // Allocate device memory
-    int* d_pix_array;
-    int* d_ix_array;
-    int* d_iy_array;
-    int* d_face_num_array;
-
-    cudaMalloc(&d_pix_array, num_elements * sizeof(int));
-    cudaMalloc(&d_ix_array, num_elements * sizeof(int));
-    cudaMalloc(&d_iy_array, num_elements * sizeof(int));
-    cudaMalloc(&d_face_num_array, num_elements * sizeof(int));
-
-    // Compute rotation parameters
-    bool flip = src_clockwise != dest_clockwise;
-    int rotations = dest_origin - src_origin;
-    int k = (dest_clockwise ? -rotations : rotations) & 3;
-    if (k < 0) { k += 4; }
-
-    // Launch kernels
-    initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
-    ring2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
-    xyf2flat_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements, flip, k);
-
-    // Adjust kernel launch for 3D data
-    rearrange_data_kernel_3d_batch<<<grid_dim, block_dim>>>(data_in_ring.data_ptr<T>(), data_in_xy.data_ptr<T>(), d_pix_array, m, n, num_elements);
-
-    // Free device memory
-    cudaFree(d_pix_array);
-    cudaFree(d_ix_array);
-    cudaFree(d_iy_array);
-    cudaFree(d_face_num_array);
-}
-
-
-template <typename T>
-void xy2ring_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_in_ring,
-                                  const int src_origin, const bool src_clockwise,
-                                  const int dest_origin, const bool dest_clockwise,
-                                  const int nside, const size_t num_elements) {
-
-    // Check tensor dimensions and sizes
-    TORCH_CHECK(data_in_xy.dim() == 3, "data_in_xy must be a 3D tensor");
-    TORCH_CHECK(data_in_ring.dim() == 3, "data_in_ring must be a 3D tensor");
-
-    int m = data_in_xy.size(0); // First dimension
-    int n = data_in_xy.size(1); // Second dimension
-    int order = compute_order(nside);
-
-    int block_size = 256;
-    int num_blocks = (num_elements + block_size - 1)/ block_size;
-
-    // Calculate block and grid dimensions
-    dim3 block_dim(32, 4, 4); // Example 3D block dimensions
-    dim3 grid_dim((num_elements + block_dim.x - 1) / block_dim.x,
-                  (n + block_dim.y - 1) / block_dim.y,
-                  (m + block_dim.z - 1) / block_dim.z);
-
-    // Allocate device memory
-    int* d_pix_array;
-    int* d_ix_array;
-    int* d_iy_array;
-    int* d_face_num_array;
-
-    cudaMalloc(&d_pix_array, num_elements * sizeof(int));
-    cudaMalloc(&d_ix_array, num_elements * sizeof(int));
-    cudaMalloc(&d_iy_array, num_elements * sizeof(int));
-    cudaMalloc(&d_face_num_array, num_elements * sizeof(int));
-
-    // Compute rotation parameters
-    bool flip = src_clockwise != dest_clockwise;
-    int rotations = dest_origin - src_origin;
-    int k = (dest_clockwise ? -rotations : rotations) & 3;
-    if (k < 0) { k += 4; }
-
-    // Launch kernels
-    initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
-    flat2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements, flip, k);
-    xyf2ring_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
-
-    // Adjust kernel launch for 3D data
-    rearrange_data_kernel_3d_batch<<<grid_dim, block_dim>>>(data_in_xy.data_ptr<T>(), data_in_ring.data_ptr<T>(), d_pix_array, m, n, num_elements);
-
-    // Free device memory
-    cudaFree(d_pix_array);
-    cudaFree(d_ix_array);
-    cudaFree(d_iy_array);
-    cudaFree(d_face_num_array);
-}
-
-
-template <typename T>
-void ring2nest_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data_in_nest,
-                                    const int nside, const size_t num_elements) {
-
-    // Check tensor dimensions and sizes
-    TORCH_CHECK(data_in_ring.dim() == 3, "data_in_ring must be a 3D tensor");
-    TORCH_CHECK(data_in_nest.dim() == 3, "data_in_nest must be a 3D tensor");
-
-    int m = data_in_ring.size(0); // First dimension
-    int n = data_in_ring.size(1); // Second dimension
-    int order = compute_order(nside);
-
-    int block_size = 256;
-    int num_blocks = (num_elements + block_size - 1)/ block_size;
-
-    // Calculate block and grid dimensions
-    dim3 block_dim(32, 4, 4); // Example 3D block dimensions
-    dim3 grid_dim((num_elements + block_dim.x - 1) / block_dim.x,
-                  (n + block_dim.y - 1) / block_dim.y,
-                  (m + block_dim.z - 1) / block_dim.z);
-
-    // Allocate device memory
-    int* d_pix_array;
-    int* d_ix_array;
-    int* d_iy_array;
-    int* d_face_num_array;
-
-    cudaMalloc(&d_pix_array, num_elements * sizeof(int));
-    cudaMalloc(&d_ix_array, num_elements * sizeof(int));
-    cudaMalloc(&d_iy_array, num_elements * sizeof(int));
-    cudaMalloc(&d_face_num_array, num_elements * sizeof(int));
-
-    // Launch kernels
-    initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
-    ring2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
-    xyf2nest_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
-
-    // Adjust kernel launch for 3D data
-    rearrange_data_kernel_3d_batch<<<grid_dim, block_dim>>>(data_in_ring.data_ptr<T>(), data_in_nest.data_ptr<T>(), d_pix_array, m, n, num_elements);
-
-    // Free device memory
-    cudaFree(d_pix_array);
-    cudaFree(d_ix_array);
-    cudaFree(d_iy_array);
-    cudaFree(d_face_num_array);
-}
-
-template <typename T>
-void nest2ring_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_ring,
-                                    const int nside, const size_t num_elements) {
-
-    // Check tensor dimensions and sizes
-    TORCH_CHECK(data_in_nest.dim() == 3, "data_in_nest must be a 3D tensor");
-    TORCH_CHECK(data_in_ring.dim() == 3, "data_in_ring must be a 3D tensor");
-
-    int m = data_in_nest.size(0); // First dimension
-    int n = data_in_nest.size(1); // Second dimension
-    int order = compute_order(nside);
-
-    int block_size = 256;
-    int num_blocks = (num_elements + block_size - 1)/ block_size;
-
-    // Calculate block and grid dimensions
-    dim3 block_dim(32, 4, 4); // Example 3D block dimensions
-    dim3 grid_dim((num_elements + block_dim.x - 1) / block_dim.x,
-                  (n + block_dim.y - 1) / block_dim.y,
-                  (m + block_dim.z - 1) / block_dim.z);
-
-    // Allocate device memory
-    int* d_pix_array;
-    int* d_ix_array;
-    int* d_iy_array;
-    int* d_face_num_array;
-
-    cudaMalloc(&d_pix_array, num_elements * sizeof(int));
-    cudaMalloc(&d_ix_array, num_elements * sizeof(int));
-    cudaMalloc(&d_iy_array, num_elements * sizeof(int));
-    cudaMalloc(&d_face_num_array, num_elements * sizeof(int));
-
-    // Launch kernels
-    initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
-    nest2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
-    xyf2ring_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
-
-    // Adjust kernel launch for 3D data
-    rearrange_data_kernel_3d_batch<<<grid_dim, block_dim>>>(data_in_nest.data_ptr<T>(), data_in_ring.data_ptr<T>(), d_pix_array, m, n, num_elements);
-
-    // Free device memory
-    cudaFree(d_pix_array);
-    cudaFree(d_ix_array);
-    cudaFree(d_iy_array);
-    cudaFree(d_face_num_array);
-}
-*/
 // Dispatch functions for each wrapper
 void ring2nest_batch_dispatch(torch::Tensor data_in_ring, torch::Tensor data_in_nest, const int nside, const size_t num_elements) {
@@ -1677,11 +1137,11 @@ void nest2ring_batch_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_
 void nest2xy_batch_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_xy, const int src_origin, const bool src_clockwise,
                    const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements) {
-    
+
     AT_DISPATCH_ALL_TYPES(data_in_nest.scalar_type(), "nest2xy_batch", ([&] {
         nest2xy_batch_kernel_wrapper<scalar_t>(data_in_nest, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
     }));
-    
+
 }
 void xy2nest_batch_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_nest, const int src_origin, const bool src_clockwise,
@@ -1699,7 +1159,7 @@ void ring2xy_batch_dispatch(torch::Tensor data_in_ring, torch::Tensor data_in_xy
     AT_DISPATCH_ALL_TYPES(data_in_ring.scalar_type(), "ring2xy_batch", ([&] {
         ring2xy_batch_kernel_wrapper<scalar_t>(data_in_ring, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
     }));
-    
+
 }
 void xy2ring_batch_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_ring, const int src_origin, const bool src_clockwise,
@@ -1708,7 +1168,7 @@ void xy2ring_batch_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_ring
     AT_DISPATCH_ALL_TYPES(data_in_xy.scalar_type(), "xy2ring_batch", ([&] {
         xy2ring_batch_kernel_wrapper<scalar_t>(data_in_xy, data_in_ring, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
     }));
-    
+
 }
 void xy2xy_batch_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, const int src_origin, const bool src_clockwise,
@@ -1717,5 +1177,6 @@ void xy2xy_batch_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, c
     AT_DISPATCH_ALL_TYPES(data_xy_in.scalar_type(), "xy2xy_batch", ([&] {
         xy2xy_batch_kernel_wrapper<scalar_t>(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
     }));
-    
+
 }
+
diff --git a/src/harmonic_transform/hpx_fft.cpp b/src/harmonic_transform/hpx_fft.cpp
index 90c8af1..8531caf 100644
--- a/src/harmonic_transform/hpx_fft.cpp
+++ b/src/harmonic_transform/hpx_fft.cpp
@@ -421,11 +421,6 @@ torch::Tensor healpix_irfft_cuda(torch::Tensor ftm, int L, int nside) {
 }
 
-// PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-//     m.def("healpix_rfft_cuda", &healpix_rfft_cuda, "HEALPix RFFT with cuFFT and CUDA");
-//     m.def("healpix_irfft_cuda", &healpix_irfft_cuda, "HEALPix IRFFT with cuFFT and CUDA");
-// }
-
 int nphi_ring(int t, int nside);
 int cumulative_nphi_ring(int t, int nside);
 double p2phi_ring(int t, int offset, int nside);

From beb7519ea4f1d34728d08a621e083807c099c04d Mon Sep 17 00:00:00 2001
From: xiaopoc
Date: Mon, 2 Jun 2025 11:52:17 -0700
Subject: [PATCH 3/6] Clean up

---
 cuhpx/hpx_remap.py |  3 +--
 cuhpx/hpx_sht.py   | 31 +------------------------------
 2 files 
changed, 2 insertions(+), 32 deletions(-) diff --git a/cuhpx/hpx_remap.py b/cuhpx/hpx_remap.py index a1adb05..0bd6612 100644 --- a/cuhpx/hpx_remap.py +++ b/cuhpx/hpx_remap.py @@ -13,9 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import cuhpx_remap import torch - +from . import cuhpx_remap def is_power_of_two(n): return (n > 0) and (n & (n - 1)) == 0 diff --git a/cuhpx/hpx_sht.py b/cuhpx/hpx_sht.py index bb77bb2..eec5ce1 100644 --- a/cuhpx/hpx_sht.py +++ b/cuhpx/hpx_sht.py @@ -13,10 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import cuhpx_fft +from . import cuhpx_fft import numpy as np import torch -import torch.cuda.nvtx import torch.nn as nn from torch.autograd import Function @@ -449,13 +448,10 @@ def einsum_with_chunking(x, weights, mmax, xout, nchunk, stream1): device = torch.device("cuda") chunk_size = int(weights.size(1) / nchunk + 1) # Adjust this based on your memory constraints - torch.cuda.nvtx.range_push("Allocate memory for chunk") next_chunk_cpu = torch.empty((weights.size(0), chunk_size, weights.size(2)), dtype=weights.dtype, pin_memory=True) current_chunk = torch.empty((weights.size(0), chunk_size, weights.size(2)), dtype=weights.dtype, device=device) next_chunk = torch.empty_like(current_chunk) - torch.cuda.nvtx.range_pop() - torch.cuda.nvtx.range_push("einsum between x and weights with chunking") # Create events for synchronization event_transfer = torch.cuda.Event(blocking=True) @@ -475,40 +471,30 @@ def einsum_with_chunking(x, weights, mmax, xout, nchunk, stream1): if actual_chunk_size != chunk_size: next_chunk_cpu.resize_((weights.size(0), actual_chunk_size, weights.size(2))) - torch.cuda.nvtx.range_push("CPU copy from weights to pin memory") next_chunk_cpu.copy_(weights[:, start_i:end_i, :]) - torch.cuda.nvtx.range_pop() with torch.cuda.stream(stream1): - torch.cuda.nvtx.range_push(f"Transfer weights chunk {i}:{end_i} to GPU") next_chunk[: weights.size(0), : end_i - start_i, :].copy_(next_chunk_cpu, non_blocking=True) event_transfer.record(stream1) - torch.cuda.nvtx.range_pop() - torch.cuda.nvtx.range_push(f"Compute einsum for chunk {i - chunk_size}:{end_i - chunk_size}") xout[..., start_j:end_j, :, :] = torch.einsum( '...kmn,mlk->...lmn', x, current_chunk[:, : end_j - start_j, :].to(x.dtype) ) event_computation.record(torch.cuda.current_stream()) - torch.cuda.nvtx.range_pop() torch.cuda.current_stream().wait_event(event_transfer) current_chunk, next_chunk = next_chunk, current_chunk start_j, end_j = start_i, end_i if start_i < weights.size(1): - torch.cuda.nvtx.range_push("Compute einsum for the last chunk") xout[..., start_i:end_i, :, :] = torch.einsum( '...kmn,mlk->...lmn', x, current_chunk[:, : end_i - start_i, :].to(x.dtype) ) - torch.cuda.nvtx.range_pop() stream1.synchronize() torch.cuda.current_stream().synchronize() - torch.cuda.nvtx.range_pop() # End of einsum with chunking - return xout @@ -523,39 +509,28 @@ def forward(ctx, x, weights, pct, W, mmax, lmax, nside): ctx.lmax = lmax ctx.nside = nside - torch.cuda.nvtx.range_push("rfft") # SHT if x.dim() == 1: x = cuhpx_fft.healpix_rfft_class(x, mmax, nside) else: x = cuhpx_fft.healpix_rfft_batch(x, mmax, nside) - torch.cuda.nvtx.range_pop() - x = torch.view_as_real(x) out_shape = list(x.size()) out_shape[-3] = lmax out_shape[-2] = mmax - torch.cuda.nvtx.range_push("allocate xout") xout = torch.zeros(out_shape, dtype=x.dtype, device=x.device) - torch.cuda.nvtx.range_pop() - 
torch.cuda.nvtx.range_push("einsum between pct and weights") weights = pct * weights - torch.cuda.nvtx.range_pop() if not pct.is_cuda: - torch.cuda.nvtx.range_push("einsum between x and weights using two stream") nchunk = 12 stream1 = torch.cuda.Stream() xout = einsum_with_chunking(x, weights, mmax, xout, nchunk, stream1) - torch.cuda.nvtx.range_pop() else: - torch.cuda.nvtx.range_push("einsum between x and weights") xout = torch.einsum('...kmn,mlk->...lmn', x, weights.to(x.dtype)) - torch.cuda.nvtx.range_pop() x = torch.view_as_complex(xout.contiguous()) @@ -595,18 +570,14 @@ def forward(ctx, x, weights, pct, W, mmax, lmax, nside): x = torch.view_as_real(x) - torch.cuda.nvtx.range_push("einsum between x and pct") xs = torch.einsum('...lmn, mlk->...kmn', x, pct.to(x.dtype)) - torch.cuda.nvtx.range_pop() x = torch.view_as_complex(xs.contiguous()) - torch.cuda.nvtx.range_push("irfft") if x.dim() == 2: x = cuhpx_fft.healpix_irfft_class(x, mmax, nside) else: x = cuhpx_fft.healpix_irfft_batch(x, mmax, nside) - torch.cuda.nvtx.range_pop() return x From 5680410c77675703a8f50894873b0cce40fc387f Mon Sep 17 00:00:00 2001 From: root Date: Tue, 5 Aug 2025 01:16:40 +0000 Subject: [PATCH 4/6] fix bug for build tools --- src/harmonic_transform/hpx_fft.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/harmonic_transform/hpx_fft.cpp b/src/harmonic_transform/hpx_fft.cpp index 8531caf..ddacfd2 100644 --- a/src/harmonic_transform/hpx_fft.cpp +++ b/src/harmonic_transform/hpx_fft.cpp @@ -15,6 +15,7 @@ * limitations under the License. */ +#include #include "hpx_fft.h" #include @@ -666,7 +667,7 @@ torch::Tensor healpix_irfft(torch::Tensor ftm, int L, int nside) { return f; } -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { +PYBIND11_MODULE(cuhpx_fft, m) { m.def("healpix_rfft", &healpix_rfft, "HEALPix RFFT"); m.def("healpix_irfft", &healpix_irfft, "HEALPix IRFFT"); m.def("healpix_rfft_cufft", &healpix_rfft_cufft, "HEALPix RFFT with cuFFT"); From acb92bc2d9ca77c5dfb47f820988b4fc458258ca Mon Sep 17 00:00:00 2001 From: xiaopoc Date: Mon, 11 Aug 2025 16:47:18 -0700 Subject: [PATCH 5/6] add CMakeLists.txt --- CMakeLists.txt | 111 ++- cuhpx/hpx_remap.py | 3 +- cuhpx/hpx_sht.py | 2 + pyproject.toml | 39 +- setup.py | 55 -- src/data_remapping/hpx_remapping.cpp | 946 ++++++++++++----------- src/data_remapping/hpx_remapping.h | 4 +- src/data_remapping/hpx_remapping_cuda.cu | 627 +++++++++++++-- src/harmonic_transform/hpx_fft.cpp | 6 +- 9 files changed, 1174 insertions(+), 619 deletions(-) delete mode 100644 setup.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 99a9700..55d0226 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,69 +1,106 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR) project(cuhpx LANGUAGES CXX CUDA) -# C++ and CUDA standard +# ---------- Basics ---------- set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +# LTO/IPO can drop CUDA fatbins; keep it off for these extensions +set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF) -# Find Python and determine site-packages path +# CUDA archs default; override via -DCMAKE_CUDA_ARCHITECTURES=80;86;90 +if(NOT CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "OFF") + set(CMAKE_CUDA_ARCHITECTURES 90) # Hopper default +endif() +message(STATUS "✅ cuhpx build configured. 
CUDA archs: ${CMAKE_CUDA_ARCHITECTURES}")
+
+# ---------- Python / Torch ----------
 find_package(Python REQUIRED COMPONENTS Interpreter Development.Module)
+
+# Find site-packages so we can locate Torch's CMake configs
 execute_process(
-    COMMAND "${Python_EXECUTABLE}" -c "import sysconfig; print(sysconfig.get_paths()['purelib'])"
-    OUTPUT_VARIABLE PYTHON_SITE_PACKAGES
-    OUTPUT_STRIP_TRAILING_WHITESPACE
+  COMMAND "${Python_EXECUTABLE}" -c "import sysconfig; print(sysconfig.get_paths()['purelib'])"
+  OUTPUT_VARIABLE PYTHON_SITE_PACKAGES
+  OUTPUT_STRIP_TRAILING_WHITESPACE
 )
-
-# Add PyTorch CMake path
 list(APPEND CMAKE_PREFIX_PATH "${PYTHON_SITE_PACKAGES}/torch/share/cmake")
 
-# Find required packages
-find_package(Torch REQUIRED)
+find_package(Torch REQUIRED)    # ${TORCH_LIBRARIES}, ${TORCH_CXX_FLAGS}
 find_package(pybind11 REQUIRED)
-find_package(CUDA REQUIRED)
 
-# Optional: Locate libtorch_python.so dynamically
-set(TORCH_PYTHON_LIB "${PYTHON_SITE_PACKAGES}/torch/lib/libtorch_python.so")
+# ---------- CUDA toolchain (modern imported targets) ----------
+find_package(CUDAToolkit REQUIRED)   # provides CUDA::cudart, etc.
 
-# Set CUDA architecture (change if needed)
-set(CMAKE_CUDA_ARCHITECTURES 90)
-list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr --expt-extended-lambda -O3)
+# Optional: Torch's Python shim (resolves at::Tensor pybind casters)
+find_library(TORCH_PYTHON_LIBRARY
+  NAMES torch_python
+  HINTS "${PYTHON_SITE_PACKAGES}/torch/lib" "${TORCH_INSTALL_PREFIX}/lib"
+)
+if(TORCH_PYTHON_LIBRARY)
+  message(STATUS "Found torch_python at: ${TORCH_PYTHON_LIBRARY}")
+endif()
 
-# Include paths
 include_directories(
-    ${TORCH_INCLUDE_DIRS}
-    ${CUDA_INCLUDE_DIRS}
-    ${CMAKE_CURRENT_SOURCE_DIR}/src
+  ${TORCH_INCLUDE_DIRS}
+  ${CMAKE_CURRENT_SOURCE_DIR}/src
 )
 
+# ---------- Helper: apply safe CUDA flags per target ----------
+function(cuhpx_apply_cuda_flags target_name)
+  target_compile_options(${target_name} PRIVATE
+    $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
+    $<$<COMPILE_LANGUAGE:CUDA>:--expt-extended-lambda>
+    $<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe>
+    $<$<COMPILE_LANGUAGE:CUDA>:--diag_suppress=20014>
+  )
+  set_target_properties(${target_name} PROPERTIES
+    CUDA_SEPARABLE_COMPILATION ON
+    INTERPROCEDURAL_OPTIMIZATION FALSE
+  )
+endfunction()
+
 # ==================== cuhpx_fft ====================
 set(CUHPX_FFT_SRC
-    src/harmonic_transform/hpx_fft.cpp
-    src/harmonic_transform/hpx_fft_cuda.cu
+  src/harmonic_transform/hpx_fft.cpp
+  src/harmonic_transform/hpx_fft_cuda.cu
 )
-
 pybind11_add_module(cuhpx_fft MODULE ${CUHPX_FFT_SRC})
-target_link_libraries(cuhpx_fft PRIVATE ${TORCH_LIBRARIES} ${CUDA_LIBRARIES})
-target_link_libraries(cuhpx_fft PRIVATE ${TORCH_PYTHON_LIB})
 target_compile_definitions(cuhpx_fft PRIVATE ${TORCH_CXX_FLAGS})
-set_target_properties(cuhpx_fft PROPERTIES PREFIX "" LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cuhpx)
+target_link_libraries(cuhpx_fft PRIVATE
+  ${TORCH_LIBRARIES}
+  CUDA::cudart
+  Python::Module
+)
+if(TORCH_PYTHON_LIBRARY)
+  target_link_libraries(cuhpx_fft PRIVATE ${TORCH_PYTHON_LIBRARY})
+endif()
+set_target_properties(cuhpx_fft PROPERTIES PREFIX "" OUTPUT_NAME "cuhpx_fft")
+cuhpx_apply_cuda_flags(cuhpx_fft)
 
 # ==================== cuhpx_remap ====================
 set(CUHPX_REMAP_SRC
-    src/data_remapping/hpx_remapping.cpp
-    src/data_remapping/hpx_remapping_cuda.cu
+  src/data_remapping/hpx_remapping.cpp
+  src/data_remapping/hpx_remapping_cuda.cu
 )
-
 pybind11_add_module(cuhpx_remap MODULE ${CUHPX_REMAP_SRC})
-target_link_libraries(cuhpx_remap PRIVATE ${TORCH_LIBRARIES} ${CUDA_LIBRARIES})
-target_link_libraries(cuhpx_remap PRIVATE ${TORCH_PYTHON_LIB})
target_compile_definitions(cuhpx_remap PRIVATE ${TORCH_CXX_FLAGS}) -set_target_properties(cuhpx_remap PROPERTIES PREFIX "" LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cuhpx) - -# ==================== Install ==================== -install(TARGETS cuhpx_fft cuhpx_remap DESTINATION cuhpx) -install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cuhpx/data DESTINATION cuhpx FILES_MATCHING PATTERN "*.fits") - -message(STATUS "✅ cuhpx build system configured successfully.") +target_link_libraries(cuhpx_remap PRIVATE + ${TORCH_LIBRARIES} + CUDA::cudart + Python::Module +) +if(TORCH_PYTHON_LIBRARY) + target_link_libraries(cuhpx_remap PRIVATE ${TORCH_PYTHON_LIBRARY}) +endif() +set_target_properties(cuhpx_remap PROPERTIES PREFIX "" OUTPUT_NAME "cuhpx_remap") +cuhpx_apply_cuda_flags(cuhpx_remap) +# ---------- Install layout for wheels ---------- +# scikit-build-core will install these into site-packages/cuhpx/ for wheels. +install(TARGETS cuhpx_fft cuhpx_remap LIBRARY DESTINATION cuhpx) +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cuhpx/data + DESTINATION cuhpx + FILES_MATCHING PATTERN "*.fits" +) diff --git a/cuhpx/hpx_remap.py b/cuhpx/hpx_remap.py index 0bd6612..4090424 100644 --- a/cuhpx/hpx_remap.py +++ b/cuhpx/hpx_remap.py @@ -13,8 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import torch from . import cuhpx_remap +import torch + def is_power_of_two(n): return (n > 0) and (n & (n - 1)) == 0 diff --git a/cuhpx/hpx_sht.py b/cuhpx/hpx_sht.py index eec5ce1..89629d8 100644 --- a/cuhpx/hpx_sht.py +++ b/cuhpx/hpx_sht.py @@ -495,6 +495,7 @@ def einsum_with_chunking(x, weights, mmax, xout, nchunk, stream1): stream1.synchronize() torch.cuda.current_stream().synchronize() + return xout @@ -515,6 +516,7 @@ def forward(ctx, x, weights, pct, W, mmax, lmax, nside): else: x = cuhpx_fft.healpix_rfft_batch(x, mmax, nside) + x = torch.view_as_real(x) out_shape = list(x.size()) diff --git a/pyproject.toml b/pyproject.toml index e4454d1..326b0fa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,21 +1,24 @@ [build-system] -requires = ["torch", "setuptools"] -build-backend = "setuptools.build_meta" +requires = ["scikit-build-core>=0.8", "pybind11>=2.12", "numpy"] +build-backend = "scikit_build_core.build" [project] name = "cuHPX" -version = "2025.5.1" -description = "GPU-accelerated utilities for data on HEALPix grids." -readme = "README.md" -license = { file="LICENSE.txt" } +version = "2025.8.1" +description = "CUDA-accelerated HEALPix tools for harmonic transforms and remapping" authors = [ - { name = "NVIDIA", email = "asubramaniam@nvidia.com" } + { name = "Xiaopo Cheng", email = "xiaopoc@nvidia.com" }, + { name = "Akshay Subramaniam", email = "asubramaniam@nvidia.com" } ] +readme = "README.md" +license = { file = "LICENSE.txt" } +requires-python = ">=3.8" dependencies = [ - "numpy", - "torch>=2.0.0", - "astropy", - "torch_harmonics", + "numpy", + "astropy", + "torch_harmonics" + # Expect PyTorch preinstalled in the environment; if you want to enforce: + # "torch>=2.4", ] classifiers = [ "Development Status :: 2 - Pre-Alpha", @@ -32,6 +35,18 @@ classifiers = [ [project.urls] "Homepage" = "https://github.com/NVlabs/cuHPX" +[tool.scikit-build] +wheel.packages = ["cuhpx"] +cmake.minimum-version = "3.18" +cmake.source-dir = "." 
+# Keep build artifacts in build/, not alongside sources +build-dir = "build/{wheel_tag}" +sdist.include = ["src", "cuhpx", "tests", "CMakeLists.txt", "LICENSE.txt", "README.md"] + +[tool.scikit-build.editable] +# Key: makes editable installs load extensions from build/ via a redirect hook +mode = "redirect" +verbose = true [tool.black] line-length = 120 @@ -115,4 +130,4 @@ target-version = 'py38' "S101", # asserts allowed in tests... "ARG", # Unused function args -> fixtures nevertheless are functionally relevant... "FBT", # Don't care about booleans as positional arguments in tests, e.g. via @pytest.mark.parametrize() -] +] \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index 4465e59..0000000 --- a/setup.py +++ /dev/null @@ -1,55 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from setuptools import find_packages, setup -from torch.utils.cpp_extension import BuildExtension, CUDAExtension - -setup( - name='cuhpx', - version='0.1.0', - packages=find_packages(), - package_data={'cuhpx': ['**/*.fits']}, - url='https://gitlab-master.nvidia.com/Devtech-Compute/cuhpx', - license='TBD', - author='Xiaopo Cheng, Akshay Subramaniam', - author_email='xiaopoc@nvidia.com, asubramaniam@nvidia.com', - description='A library for performing transformations and analysis on HEALPix', - install_requires=[ - 'numpy', - 'torch', - 'astropy', - 'torch_harmonics', - ], - ext_modules=[ - CUDAExtension( - 'cuhpx_remap', - sources=[ - 'src/data_remapping/hpx_remapping.cpp', - 'src/data_remapping/hpx_remapping_cuda.cu', - ], - extra_compile_args={'nvcc': ['-O2']}, - ), - CUDAExtension( - 'cuhpx_fft', - sources=[ - 'src/harmonic_transform/hpx_fft.cpp', - 'src/harmonic_transform/hpx_fft_cuda.cu', - ], - extra_compile_args={'nvcc': ['-O2', '-lineinfo']}, - ), - ], - cmdclass={'build_ext': BuildExtension}, -) diff --git a/src/data_remapping/hpx_remapping.cpp b/src/data_remapping/hpx_remapping.cpp index aaff451..632d1e1 100644 --- a/src/data_remapping/hpx_remapping.cpp +++ b/src/data_remapping/hpx_remapping.cpp @@ -1,468 +1,478 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include "hpx_remapping.h" - -torch::Tensor ring2nest(torch::Tensor data_in_ring, const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_in_nest = torch::empty_like(data_in_ring); - - ring2nest_dispatch(data_in_ring, data_in_nest, nside, num_elements); - - return data_in_nest; -} - - -torch::Tensor nest2ring(torch::Tensor data_in_nest, const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_in_ring = torch::empty_like(data_in_nest); - - nest2ring_dispatch(data_in_nest, data_in_ring, nside, num_elements); - - return data_in_ring; -} - -torch::Tensor nest2xy(torch::Tensor data_in_nest, const std::string& origin, const bool clockwise, - const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_in_xy = torch::empty_like(data_in_nest); - - int src_origin = 0; - int dest_origin; - - bool src_clockwise = false; - bool dest_clockwise = clockwise; - - if (origin == "S"){ - dest_origin = 0; - } else if (origin == "E"){ - dest_origin = 1; - } else if (origin == "N"){ - dest_origin = 2; - } else if (origin == "W"){ - dest_origin = 3; - } else { - TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); - } - - - nest2xy_dispatch(data_in_nest, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); - - return data_in_xy; -} - -torch::Tensor xy2nest(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise, - const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_in_nest = torch::empty_like(data_in_xy); - - - int src_origin; - int dest_origin = 0; - - bool src_clockwise = clockwise; - bool dest_clockwise = false; - - if (origin == "S"){ - src_origin = 0; - } else if (origin == "E"){ - src_origin = 1; - } else if (origin == "N"){ - src_origin = 2; - } else if (origin == "W"){ - src_origin = 3; - } else { - TORCH_CHECK(false, "The origin must be one from S, E, N, W. 
Stop"); - } - - - xy2nest_dispatch(data_in_xy, data_in_nest, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); - - return data_in_nest; -} - -torch::Tensor ring2xy(torch::Tensor data_in_ring, const std::string& origin, const bool clockwise, - const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_in_xy = torch::empty_like(data_in_ring); - - int src_origin = 0; - int dest_origin; - - bool src_clockwise = false; - bool dest_clockwise = clockwise; - - if (origin == "S"){ - dest_origin = 0; - } else if (origin == "E"){ - dest_origin = 1; - } else if (origin == "N"){ - dest_origin = 2; - } else if (origin == "W"){ - dest_origin = 3; - } else { - TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); - } - - - ring2xy_dispatch(data_in_ring, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); - - return data_in_xy; -} - -torch::Tensor xy2ring(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise, - const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_in_ring = torch::empty_like(data_in_xy); - - int src_origin; - int dest_origin = 0; - - bool src_clockwise = clockwise; - bool dest_clockwise = false; - - if (origin == "S"){ - src_origin = 0; - } else if (origin == "E"){ - src_origin = 1; - } else if (origin == "N"){ - src_origin = 2; - } else if (origin == "W"){ - src_origin = 3; - } else { - TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); - } - - - xy2ring_dispatch(data_in_xy, data_in_ring, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); - - return data_in_ring; -} - -torch::Tensor xy2xy(torch::Tensor data_xy_in, const std::string& s_origin, const bool src_clockwise, const std::string& d_origin, const bool dest_clockwise, - const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_xy_out = torch::empty_like(data_xy_in); - - int src_origin; - int dest_origin; - - - if (s_origin == "S"){ - src_origin = 0; - } else if (s_origin == "E"){ - src_origin = 1; - } else if (s_origin == "N"){ - src_origin = 2; - } else if (s_origin == "W"){ - src_origin = 3; - } else { - TORCH_CHECK(false, "The src origin must be one from S, E, N, W. Stop"); - } - - if (d_origin == "S"){ - dest_origin = 0; - } else if (d_origin == "E"){ - dest_origin = 1; - } else if (d_origin == "N"){ - dest_origin = 2; - } else if (d_origin == "W"){ - dest_origin = 3; - } else { - TORCH_CHECK(false, "The dest origin must be one from S, E, N, W. 
Stop"); - } - - - xy2xy_dispatch(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); - - return data_xy_out; -} - - -torch::Tensor ring2nest_batch(torch::Tensor data_in_ring, const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_in_nest = torch::empty_like(data_in_ring); - - ring2nest_batch_dispatch(data_in_ring, data_in_nest, nside, num_elements); - - return data_in_nest; -} - - -torch::Tensor nest2ring_batch(torch::Tensor data_in_nest, const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_in_ring = torch::empty_like(data_in_nest); - - nest2ring_batch_dispatch(data_in_nest, data_in_ring, nside, num_elements); - - return data_in_ring; -} - -torch::Tensor nest2xy_batch(torch::Tensor data_in_nest, const std::string& origin, const bool clockwise, - const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_in_xy = torch::empty_like(data_in_nest); - - int src_origin = 0; - int dest_origin; - - bool src_clockwise = false; - bool dest_clockwise = clockwise; - - if (origin == "S"){ - dest_origin = 0; - } else if (origin == "E"){ - dest_origin = 1; - } else if (origin == "N"){ - dest_origin = 2; - } else if (origin == "W"){ - dest_origin = 3; - } else { - TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); - } - - - nest2xy_batch_dispatch(data_in_nest, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); - - return data_in_xy; -} - -torch::Tensor xy2nest_batch(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise, - const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_in_nest = torch::empty_like(data_in_xy); - - - int src_origin; - int dest_origin = 0; - - bool src_clockwise = clockwise; - bool dest_clockwise = false; - - if (origin == "S"){ - src_origin = 0; - } else if (origin == "E"){ - src_origin = 1; - } else if (origin == "N"){ - src_origin = 2; - } else if (origin == "W"){ - src_origin = 3; - } else { - TORCH_CHECK(false, "The origin must be one from S, E, N, W. 
Stop"); - } - - - xy2nest_batch_dispatch(data_in_xy, data_in_nest, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); - - return data_in_nest; -} - -torch::Tensor ring2xy_batch(torch::Tensor data_in_ring, const std::string& origin, const bool clockwise, - const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_in_xy = torch::empty_like(data_in_ring); - - int src_origin = 0; - int dest_origin; - - bool src_clockwise = false; - bool dest_clockwise = clockwise; - - if (origin == "S"){ - dest_origin = 0; - } else if (origin == "E"){ - dest_origin = 1; - } else if (origin == "N"){ - dest_origin = 2; - } else if (origin == "W"){ - dest_origin = 3; - } else { - TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); - } - - - ring2xy_batch_dispatch(data_in_ring, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); - - return data_in_xy; -} - -torch::Tensor xy2ring_batch(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise, - const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_in_ring = torch::empty_like(data_in_xy); - - int src_origin; - int dest_origin = 0; - - bool src_clockwise = clockwise; - bool dest_clockwise = false; - - if (origin == "S"){ - src_origin = 0; - } else if (origin == "E"){ - src_origin = 1; - } else if (origin == "N"){ - src_origin = 2; - } else if (origin == "W"){ - src_origin = 3; - } else { - TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); - } - - - xy2ring_batch_dispatch(data_in_xy, data_in_ring, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); - - return data_in_ring; -} - -torch::Tensor xy2xy_batch(torch::Tensor data_xy_in, const std::string& s_origin, const bool src_clockwise, const std::string& d_origin, const bool dest_clockwise, - const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_xy_out = torch::empty_like(data_xy_in); - - int src_origin; - int dest_origin; - - - if (s_origin == "S"){ - src_origin = 0; - } else if (s_origin == "E"){ - src_origin = 1; - } else if (s_origin == "N"){ - src_origin = 2; - } else if (s_origin == "W"){ - src_origin = 3; - } else { - TORCH_CHECK(false, "The src origin must be one from S, E, N, W. Stop"); - } - - if (d_origin == "S"){ - dest_origin = 0; - } else if (d_origin == "E"){ - dest_origin = 1; - } else if (d_origin == "N"){ - dest_origin = 2; - } else if (d_origin == "W"){ - dest_origin = 3; - } else { - TORCH_CHECK(false, "The dest origin must be one from S, E, N, W. 
Stop"); - } - - - xy2xy_batch_dispatch(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); - - return data_xy_out; -} - - - -PYBIND11_MODULE(cuhpx_remap, m) { - - m.def("ring2nest", &ring2nest, "Convert ring to nest (CUDA)"); - m.def("nest2ring", &nest2ring, "Convert nest to ring (CUDA)"); - - m.def("nest2xy", &nest2xy, "Convert nest to xy (CUDA)"); - m.def("xy2nest", &xy2nest, "Convert xy to nest (CUDA)"); - - m.def("ring2xy", &ring2xy, "Convert ring to xy (CUDA)"); - m.def("xy2ring", &xy2ring, "Convert xy to ring (CUDA)"); - - m.def("xy2xy", &xy2xy, "Convert xy to xy (CUDA)"); - - m.def("ring2nest_batch", &ring2nest_batch, "Convert ring to nest (CUDA) in batch"); - m.def("nest2ring_batch", &nest2ring_batch, "Convert nest to ring (CUDA) in batch"); - - m.def("nest2xy_batch", &nest2xy_batch, "Convert nest to xy (CUDA) in batch"); - m.def("xy2nest_batch", &xy2nest_batch, "Convert xy to nest (CUDA) in batch"); - - m.def("ring2xy_batch", &ring2xy_batch, "Convert ring to xy (CUDA) in batch"); - m.def("xy2ring_batch", &xy2ring_batch, "Convert xy to ring (CUDA) in batch"); - - m.def("xy2xy_batch", &xy2xy_batch, "Convert xy to xy (CUDA) in batch"); - -} - +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "hpx_remapping.h" + +torch::Tensor ring2nest(torch::Tensor data_in_ring, const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); + + auto data_in_nest = torch::empty_like(data_in_ring); + + ring2nest_dispatch(data_in_ring, data_in_nest, nside, num_elements); + + return data_in_nest; +} + + +torch::Tensor nest2ring(torch::Tensor data_in_nest, const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); + + auto data_in_ring = torch::empty_like(data_in_nest); + + nest2ring_dispatch(data_in_nest, data_in_ring, nside, num_elements); + + return data_in_ring; +} + +torch::Tensor nest2xy(torch::Tensor data_in_nest, const std::string& origin, const bool clockwise, + const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. 
Stop."); + + auto data_in_xy = torch::empty_like(data_in_nest); + + int src_origin = 0; + int dest_origin; + + bool src_clockwise = false; + bool dest_clockwise = clockwise; + + if (origin == "S"){ + dest_origin = 0; + } else if (origin == "E"){ + dest_origin = 1; + } else if (origin == "N"){ + dest_origin = 2; + } else if (origin == "W"){ + dest_origin = 3; + } else { + TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); + } + + + nest2xy_dispatch(data_in_nest, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); + + return data_in_xy; +} + +torch::Tensor xy2nest(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise, + const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); + + auto data_in_nest = torch::empty_like(data_in_xy); + + + int src_origin; + int dest_origin = 0; + + bool src_clockwise = clockwise; + bool dest_clockwise = false; + + if (origin == "S"){ + src_origin = 0; + } else if (origin == "E"){ + src_origin = 1; + } else if (origin == "N"){ + src_origin = 2; + } else if (origin == "W"){ + src_origin = 3; + } else { + TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); + } + + + xy2nest_dispatch(data_in_xy, data_in_nest, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); + + return data_in_nest; +} + +torch::Tensor ring2xy(torch::Tensor data_in_ring, const std::string& origin, const bool clockwise, + const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); + + auto data_in_xy = torch::empty_like(data_in_ring); + + int src_origin = 0; + int dest_origin; + + bool src_clockwise = false; + bool dest_clockwise = clockwise; + + if (origin == "S"){ + dest_origin = 0; + } else if (origin == "E"){ + dest_origin = 1; + } else if (origin == "N"){ + dest_origin = 2; + } else if (origin == "W"){ + dest_origin = 3; + } else { + TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); + } + + + ring2xy_dispatch(data_in_ring, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); + + return data_in_xy; +} + +torch::Tensor xy2ring(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise, + const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. 
Stop."); + + auto data_in_ring = torch::empty_like(data_in_xy); + + int src_origin; + int dest_origin = 0; + + bool src_clockwise = clockwise; + bool dest_clockwise = false; + + if (origin == "S"){ + src_origin = 0; + } else if (origin == "E"){ + src_origin = 1; + } else if (origin == "N"){ + src_origin = 2; + } else if (origin == "W"){ + src_origin = 3; + } else { + TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); + } + + + xy2ring_dispatch(data_in_xy, data_in_ring, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); + + return data_in_ring; +} + +torch::Tensor xy2xy(torch::Tensor data_xy_in, const std::string& s_origin, const bool src_clockwise, const std::string& d_origin, const bool dest_clockwise, + const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); + + auto data_xy_out = torch::empty_like(data_xy_in); + + int src_origin; + int dest_origin; + + + if (s_origin == "S"){ + src_origin = 0; + } else if (s_origin == "E"){ + src_origin = 1; + } else if (s_origin == "N"){ + src_origin = 2; + } else if (s_origin == "W"){ + src_origin = 3; + } else { + TORCH_CHECK(false, "The src origin must be one from S, E, N, W. Stop"); + } + + if (d_origin == "S"){ + dest_origin = 0; + } else if (d_origin == "E"){ + dest_origin = 1; + } else if (d_origin == "N"){ + dest_origin = 2; + } else if (d_origin == "W"){ + dest_origin = 3; + } else { + TORCH_CHECK(false, "The dest origin must be one from S, E, N, W. Stop"); + } + + + xy2xy_dispatch(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); + + return data_xy_out; +} + +void benchmark_nest_ring(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); + + benchmark_nest_ring_dispatch(data_in_nest, data_in_ring, nside, num_elements); +} + + +torch::Tensor ring2nest_batch(torch::Tensor data_in_ring, const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); + + auto data_in_nest = torch::empty_like(data_in_ring); + + ring2nest_batch_dispatch(data_in_ring, data_in_nest, nside, num_elements); + + return data_in_nest; +} + + +torch::Tensor nest2ring_batch(torch::Tensor data_in_nest, const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. 
Stop."); + + auto data_in_ring = torch::empty_like(data_in_nest); + + nest2ring_batch_dispatch(data_in_nest, data_in_ring, nside, num_elements); + + return data_in_ring; +} + +torch::Tensor nest2xy_batch(torch::Tensor data_in_nest, const std::string& origin, const bool clockwise, + const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); + + auto data_in_xy = torch::empty_like(data_in_nest); + + int src_origin = 0; + int dest_origin; + + bool src_clockwise = false; + bool dest_clockwise = clockwise; + + if (origin == "S"){ + dest_origin = 0; + } else if (origin == "E"){ + dest_origin = 1; + } else if (origin == "N"){ + dest_origin = 2; + } else if (origin == "W"){ + dest_origin = 3; + } else { + TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); + } + + + nest2xy_batch_dispatch(data_in_nest, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); + + return data_in_xy; +} + +torch::Tensor xy2nest_batch(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise, + const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); + + auto data_in_nest = torch::empty_like(data_in_xy); + + + int src_origin; + int dest_origin = 0; + + bool src_clockwise = clockwise; + bool dest_clockwise = false; + + if (origin == "S"){ + src_origin = 0; + } else if (origin == "E"){ + src_origin = 1; + } else if (origin == "N"){ + src_origin = 2; + } else if (origin == "W"){ + src_origin = 3; + } else { + TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); + } + + + xy2nest_batch_dispatch(data_in_xy, data_in_nest, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); + + return data_in_nest; +} + +torch::Tensor ring2xy_batch(torch::Tensor data_in_ring, const std::string& origin, const bool clockwise, + const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); + + auto data_in_xy = torch::empty_like(data_in_ring); + + int src_origin = 0; + int dest_origin; + + bool src_clockwise = false; + bool dest_clockwise = clockwise; + + if (origin == "S"){ + dest_origin = 0; + } else if (origin == "E"){ + dest_origin = 1; + } else if (origin == "N"){ + dest_origin = 2; + } else if (origin == "W"){ + dest_origin = 3; + } else { + TORCH_CHECK(false, "The origin must be one from S, E, N, W. 
Stop"); + } + + + ring2xy_batch_dispatch(data_in_ring, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); + + return data_in_xy; +} + +torch::Tensor xy2ring_batch(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise, + const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); + + auto data_in_ring = torch::empty_like(data_in_xy); + + int src_origin; + int dest_origin = 0; + + bool src_clockwise = clockwise; + bool dest_clockwise = false; + + if (origin == "S"){ + src_origin = 0; + } else if (origin == "E"){ + src_origin = 1; + } else if (origin == "N"){ + src_origin = 2; + } else if (origin == "W"){ + src_origin = 3; + } else { + TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); + } + + + xy2ring_batch_dispatch(data_in_xy, data_in_ring, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); + + return data_in_ring; +} + +torch::Tensor xy2xy_batch(torch::Tensor data_xy_in, const std::string& s_origin, const bool src_clockwise, const std::string& d_origin, const bool dest_clockwise, + const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); + + auto data_xy_out = torch::empty_like(data_xy_in); + + int src_origin; + int dest_origin; + + + if (s_origin == "S"){ + src_origin = 0; + } else if (s_origin == "E"){ + src_origin = 1; + } else if (s_origin == "N"){ + src_origin = 2; + } else if (s_origin == "W"){ + src_origin = 3; + } else { + TORCH_CHECK(false, "The src origin must be one from S, E, N, W. Stop"); + } + + if (d_origin == "S"){ + dest_origin = 0; + } else if (d_origin == "E"){ + dest_origin = 1; + } else if (d_origin == "N"){ + dest_origin = 2; + } else if (d_origin == "W"){ + dest_origin = 3; + } else { + TORCH_CHECK(false, "The dest origin must be one from S, E, N, W. 
Stop"); + } + + + xy2xy_batch_dispatch(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); + + return data_xy_out; +} + + + +PYBIND11_MODULE(cuhpx_remap, m) { + + m.def("ring2nest", &ring2nest, "Convert ring to nest (CUDA)"); + m.def("nest2ring", &nest2ring, "Convert nest to ring (CUDA)"); + + m.def("nest2xy", &nest2xy, "Convert nest to xy (CUDA)"); + m.def("xy2nest", &xy2nest, "Convert xy to nest (CUDA)"); + + m.def("ring2xy", &ring2xy, "Convert ring to xy (CUDA)"); + m.def("xy2ring", &xy2ring, "Convert xy to ring (CUDA)"); + + m.def("xy2xy", &xy2xy, "Convert xy to xy (CUDA)"); + + m.def("benchmark_nest_ring", &benchmark_nest_ring, "Benchmark nest and ring (CUDA)"); + + m.def("ring2nest_batch", &ring2nest_batch, "Convert ring to nest (CUDA) in batch"); + m.def("nest2ring_batch", &nest2ring_batch, "Convert nest to ring (CUDA) in batch"); + + m.def("nest2xy_batch", &nest2xy_batch, "Convert nest to xy (CUDA) in batch"); + m.def("xy2nest_batch", &xy2nest_batch, "Convert xy to nest (CUDA) in batch"); + + m.def("ring2xy_batch", &ring2xy_batch, "Convert ring to xy (CUDA) in batch"); + m.def("xy2ring_batch", &xy2ring_batch, "Convert xy to ring (CUDA) in batch"); + + m.def("xy2xy_batch", &xy2xy_batch, "Convert xy to xy (CUDA) in batch"); + +} diff --git a/src/data_remapping/hpx_remapping.h b/src/data_remapping/hpx_remapping.h index 5ed936a..2edc77d 100644 --- a/src/data_remapping/hpx_remapping.h +++ b/src/data_remapping/hpx_remapping.h @@ -43,6 +43,9 @@ void xy2ring_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_ring, cons void xy2xy_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, const int src_origin, const bool src_clockwise, const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements); +void benchmark_nest_ring_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements); + + void ring2nest_batch_dispatch(torch::Tensor data_in_ring, torch::Tensor data_in_nest, const int nside, const size_t num_elements); void nest2ring_batch_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements); @@ -63,4 +66,3 @@ void xy2xy_batch_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, c const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements); #endif // HPX_REMAPPING_H - diff --git a/src/data_remapping/hpx_remapping_cuda.cu b/src/data_remapping/hpx_remapping_cuda.cu index eb0f677..41cf8f0 100644 --- a/src/data_remapping/hpx_remapping_cuda.cu +++ b/src/data_remapping/hpx_remapping_cuda.cu @@ -133,15 +133,15 @@ __device__ inline void correct_xy_orient(int& ix, int& iy, const int order, cons iy = new_iy; } -template +template __global__ void rearrange_data_kernel_naive(const T* d_data_in, T* d_data_out, const I* d_pix_array, const size_t num_elements){ - + int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < num_elements){ I pix = d_pix_array[tid]; d_data_out[pix] = d_data_in[tid]; - + } } @@ -157,7 +157,7 @@ template } -template +template __global__ void ring2xyf_kernel(const I* pix_array, int* ix_array, int* iy_array, int* face_num_array, const int order, const I nside, const size_t num_elements){ @@ -192,10 +192,10 @@ __global__ void ring2xyf_kernel(const I* pix_array, int* ix_array, int* iy_array I irm = 2*nside + 1 - tmp; I ifm = iphi - (ire >> 1) + nside - 1; I ifp = iphi - (irm >> 1) + nside - 1; - + ifm >>= order; ifp >>= order; - + face_num = (ifp == 
@@ -224,8 +224,8 @@
 }
 
-template <typename I> 
-__global__ void xyf2ring_kernel(I* pix_array, const int* ix_array, const int* iy_array, const int* face_num_array, 
+template <typename I>
+__global__ void xyf2ring_kernel(I* pix_array, const int* ix_array, const int* iy_array, const int* face_num_array,
                                 const int order, const I nside, const size_t num_elements) {
 
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -281,7 +281,7 @@
     }
 }
 
-template <typename I> 
+template <typename I>
 __global__ void nest2xyf_kernel(const I* pix_array, int* ix_array, int* iy_array, int* face_num_array,
                                 const int order, const I nside, const size_t num_elements) {
 
@@ -303,7 +303,7 @@
     }
 }
 
-template <typename I> 
+template <typename I>
 __global__ void xyf2nest_kernel(I* pix_array, const int* ix_array, const int* iy_array, const int* face_num_array,
                                 const int order, const I nside, const size_t num_elements){
 
@@ -601,7 +601,7 @@ void ring2nest_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data_in_
     ring2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
     xyf2nest_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
     rearrange_data_kernel_naive<T, int> <<<num_blocks, block_size>>>(data_in_ring.data_ptr<T>(), data_in_nest.data_ptr<T>(), d_pix_array, num_elements);
-    
+
     //cudaDeviceSynchronize();
 
     cudaFree(d_pix_array);
@@ -633,7 +633,7 @@ void nest2ring_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_
     nest2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
     xyf2ring_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
     rearrange_data_kernel_naive<T, int> <<<num_blocks, block_size>>>(data_in_nest.data_ptr<T>(), data_in_ring.data_ptr<T>(), d_pix_array, num_elements);
-    
+
     //cudaDeviceSynchronize();
 
     cudaFree(d_pix_array);
@@ -643,6 +643,170 @@ void nest2ring_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_
 }
 
+template <typename T>
+void benchmark_nest_ring_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements) {
+
+    int block_size = 1024;
+    int num_blocks = (num_elements + block_size - 1) / block_size;
+
+    int order = compute_order(nside);
+
+    int* d_pix_array;
+    int* d_ix_array;
+    int* d_iy_array;
+    int* d_face_num_array;
+
+    cudaMalloc(&d_pix_array, num_elements * sizeof(int));
+    cudaMalloc(&d_ix_array, num_elements * sizeof(int));
+    cudaMalloc(&d_iy_array, num_elements * sizeof(int));
+    cudaMalloc(&d_face_num_array, num_elements * sizeof(int));
+
+    float total_elapsed_time_r2n = 0.0f;
+    float total_elapsed_time_n2r = 0.0f;
+
+    float total_elapsed_time_n2xy = 0.0f;
+    float total_elapsed_time_xy2n = 0.0f;
+
+    float elapsed_time;
+
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    cudaStream_t stream;
+    cudaStreamCreate(&stream);
+
+    int num_runs = 3;
+    int k = 2;
+    bool flip = false;
+
+    // warm-up runs
+    for (int i = 0; i < num_runs; ++i)
+    {
+        // nest to ring
+        initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
+        nest2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
+        xyf2ring_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
+        rearrange_data_kernel_naive<T, int> <<<num_blocks, block_size>>>(data_in_nest.data_ptr<T>(), data_in_ring.data_ptr<T>(), d_pix_array, num_elements);
+
+        // ring to nest
+        initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
+        ring2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
+        xyf2nest_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
+        rearrange_data_kernel_naive<T, int> <<<num_blocks, block_size>>>(data_in_ring.data_ptr<T>(), data_in_nest.data_ptr<T>(), d_pix_array, num_elements);
+
+        // nest2xy
+        initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
+        nest2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
+        xyf2flat_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements, flip, k);
+        rearrange_data_kernel_naive<T, int> <<<num_blocks, block_size>>>(data_in_nest.data_ptr<T>(), data_in_ring.data_ptr<T>(), d_pix_array, num_elements);
+
+        // xy2nest
+        initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
+        flat2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements, flip, k);
+        xyf2nest_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
+        rearrange_data_kernel_naive<T, int> <<<num_blocks, block_size>>>(data_in_ring.data_ptr<T>(), data_in_nest.data_ptr<T>(), d_pix_array, num_elements);
+    }
+
+    cudaDeviceSynchronize();
+
+    num_runs = 10;
+
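+    // Timing pattern for the loop below: bracket each conversion with
+    // cudaEventRecord(start/stop, stream), block on cudaEventSynchronize(stop),
+    // then accumulate cudaEventElapsedTime in milliseconds. The kernels launch
+    // on the legacy default stream, which serializes with `stream`, so the
+    // events still bound the kernel work.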
+    for (int i = 0; i < num_runs; ++i)
+    {
+
+        // N2R
+        cudaEventRecord(start, stream);
+
+        initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
+        nest2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
+        xyf2ring_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
+        rearrange_data_kernel_naive<T, int> <<<num_blocks, block_size>>>(data_in_nest.data_ptr<T>(), data_in_ring.data_ptr<T>(), d_pix_array, num_elements);
+
+        cudaEventRecord(stop, stream);
+        cudaEventSynchronize(stop);
+
+        elapsed_time = 0.0f;
+        cudaEventElapsedTime(&elapsed_time, start, stop);
+        total_elapsed_time_n2r += elapsed_time;
+
+        // R2N
+        cudaEventRecord(start, stream);
+
+        initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
+        ring2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
+        xyf2nest_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
+        rearrange_data_kernel_naive<T, int> <<<num_blocks, block_size>>>(data_in_ring.data_ptr<T>(), data_in_nest.data_ptr<T>(), d_pix_array, num_elements);
+
+        cudaEventRecord(stop, stream);
+        cudaEventSynchronize(stop);
+
+        elapsed_time = 0.0f;
+        cudaEventElapsedTime(&elapsed_time, start, stop);
+        total_elapsed_time_r2n += elapsed_time;
+
+        // nest2xy
+        cudaEventRecord(start, stream);
+
+        initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
+        nest2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
+        xyf2flat_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements, flip, k);
+        rearrange_data_kernel_naive<T, int> <<<num_blocks, block_size>>>(data_in_nest.data_ptr<T>(), data_in_ring.data_ptr<T>(), d_pix_array, num_elements);
+
+        cudaEventRecord(stop, stream);
+        cudaEventSynchronize(stop);
+
+        elapsed_time = 0.0f;
+        cudaEventElapsedTime(&elapsed_time, start, stop);
+        total_elapsed_time_n2xy += elapsed_time;
+
+        // xy2nest
+        cudaEventRecord(start, stream);
+
+        initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
+        flat2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements, flip, k);
+        xyf2nest_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
+        rearrange_data_kernel_naive<T, int> <<<num_blocks, block_size>>>(data_in_ring.data_ptr<T>(), data_in_nest.data_ptr<T>(), d_pix_array, num_elements);
+
+        cudaEventRecord(stop, stream);
+        cudaEventSynchronize(stop);
+
+        elapsed_time = 0.0f;
+        cudaEventElapsedTime(&elapsed_time, start, stop);
+        total_elapsed_time_xy2n += elapsed_time;
+
+    }
+
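+    // The averages below cover only the timed iterations; the warm-up runs
+    // above are excluded so one-time startup costs do not skew the numbers.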
+    float average_elapsed_time_n2r = total_elapsed_time_n2r / num_runs;
+    float average_elapsed_time_r2n = total_elapsed_time_r2n / num_runs;
+    float average_elapsed_time_n2xy = total_elapsed_time_n2xy / num_runs;
+    float average_elapsed_time_xy2n = total_elapsed_time_xy2n / num_runs;
+
+    std::cout << "Average elapsed time of the CUDA ring2nest kernels over " << num_runs << " runs: " << average_elapsed_time_r2n << " ms" << std::endl;
+    std::cout << "Average elapsed time of the CUDA nest2ring kernels over " << num_runs << " runs: " << average_elapsed_time_n2r << " ms" << std::endl;
+
+    std::cout << "Average elapsed time of the CUDA nest2xy kernels over " << num_runs << " runs: " << average_elapsed_time_n2xy << " ms" << std::endl;
+    std::cout << "Average elapsed time of the CUDA xy2nest kernels over " << num_runs << " runs: " << average_elapsed_time_xy2n << " ms" << std::endl;
+
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+    cudaStreamDestroy(stream);
+
+    cudaDeviceSynchronize();
+
+    cudaFree(d_pix_array);
+    cudaFree(d_ix_array);
+    cudaFree(d_iy_array);
+    cudaFree(d_face_num_array);
+}
+
 // Dispatch functions for each wrapper
 void ring2nest_dispatch(torch::Tensor data_in_ring, torch::Tensor data_in_nest, const int nside, const size_t num_elements) {
@@ -661,11 +825,11 @@ void nest2ring_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_ring,
 
 void nest2xy_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_xy, const int src_origin, const bool src_clockwise,
                       const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements) {
-    
+
     AT_DISPATCH_ALL_TYPES(data_in_nest.scalar_type(), "nest2xy", ([&] {
         nest2xy_kernel_wrapper<scalar_t>(data_in_nest, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
     }));
-    
+
 }
 
 void xy2nest_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_nest, const int src_origin, const bool src_clockwise,
@@ -683,7 +847,7 @@ void ring2xy_dispatch(torch::Tensor data_in_ring, torch::Tensor data_in_xy, cons
     AT_DISPATCH_ALL_TYPES(data_in_ring.scalar_type(), "ring2xy", ([&] {
         ring2xy_kernel_wrapper<scalar_t>(data_in_ring, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
     }));
-    
+
 }
 
 void xy2ring_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_ring, const int src_origin, const bool src_clockwise,
@@ -692,7 +856,7 @@ void xy2ring_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_ring, cons
     AT_DISPATCH_ALL_TYPES(data_in_xy.scalar_type(), "xy2ring", ([&] {
         xy2ring_kernel_wrapper<scalar_t>(data_in_xy, data_in_ring, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
     }));
-    
+
 }
 
 void xy2xy_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, const int src_origin, const bool src_clockwise,
@@ -701,51 +865,61 @@ void xy2xy_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, const i
     AT_DISPATCH_ALL_TYPES(data_xy_in.scalar_type(), "xy2xy", ([&] {
         xy2xy_kernel_wrapper<scalar_t>(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
     }));
-    
+
 }
 
+void benchmark_nest_ring_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements) {
+
+    AT_DISPATCH_ALL_TYPES(data_in_nest.scalar_type(), "benchmark_nest_ring", ([&] {
+        benchmark_nest_ring_kernel_wrapper<scalar_t>(data_in_nest, data_in_ring, nside, num_elements);
+    }));
+
+}
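[Editor's note] AT_DISPATCH_ALL_TYPES selects the kernel-wrapper instantiation from the tensor's runtime dtype. Conceptually it expands to a switch of the following shape (a simplified sketch, not the actual ATen macro expansion; the _expanded name is hypothetical):

    // Schematic of what AT_DISPATCH_ALL_TYPES does for this call site;
    // the real macro covers every ATen scalar type and richer error handling.
    void benchmark_nest_ring_dispatch_expanded(torch::Tensor nest, torch::Tensor ring,
                                               int nside, size_t num_elements) {
        switch (nest.scalar_type()) {
            case torch::kFloat:  benchmark_nest_ring_kernel_wrapper<float>(nest, ring, nside, num_elements);   break;
            case torch::kDouble: benchmark_nest_ring_kernel_wrapper<double>(nest, ring, nside, num_elements);  break;
            case torch::kInt:    benchmark_nest_ring_kernel_wrapper<int>(nest, ring, nside, num_elements);     break;
            case torch::kLong:   benchmark_nest_ring_kernel_wrapper<int64_t>(nest, ring, nside, num_elements); break;
            default: TORCH_CHECK(false, "benchmark_nest_ring: unsupported dtype");
        }
    }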
+
+
 
 template <typename T, typename I>
-__global__ void rearrange_data_kernel_3d_batch(const T* d_data_in, T* d_data_out, const I* d_pix_array, 
+__global__ void rearrange_data_kernel_3d_batch(const T* d_data_in, T* d_data_out, const I* d_pix_array,
                                                const size_t m, const size_t n, const size_t num_elements) {
-    
+
     // d_data_in: m by n by num_elements 3D matrix
     int idz = blockIdx.z * blockDim.z + threadIdx.z;
     int idy = blockIdx.y * blockDim.y + threadIdx.y;
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    
+
     int tid = idz * n * num_elements + idy * num_elements + idx;
-    
+
     if (idz < m && idy < n && idx < num_elements) {
-        I pix = d_pix_array[idx]; 
+        I pix = d_pix_array[idx];
         d_data_out[tid-idx + pix] = d_data_in[tid];
     }
 }
 
 template <typename T, typename I>
-__global__ void rearrange_data_kernel_batch(const T* d_data_in, T* d_data_out, const I* d_pix_array, 
+__global__ void rearrange_data_kernel_batch(const T* d_data_in, T* d_data_out, const I* d_pix_array,
                                             const size_t n, const size_t num_elements) {
 
     unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
     unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    
+
     unsigned int tid = idy * num_elements + idx;
-    
+
     if (idy < n && idx < num_elements) {
-        I pix = d_pix_array[idx]; 
+        I pix = d_pix_array[idx];
         d_data_out[tid-idx + pix] = d_data_in[tid];
     }
 }
 
 template <typename T>
-void xy2xy_batch_kernel_wrapper(torch::Tensor data_xy_in, torch::Tensor data_xy_out, 
+void xy2xy_batch_kernel_wrapper(torch::Tensor data_xy_in, torch::Tensor data_xy_out,
     const int src_origin, const bool src_clockwise,
-    const int dest_origin, const bool dest_clockwise, 
+    const int dest_origin, const bool dest_clockwise,
     const int nside, const size_t num_elements) {
 
     // Check tensor dimensions and sizes
@@ -791,9 +965,9 @@ void xy2xy_batch_kernel_wrapper(torch::Tensor data_xy_in, torch::Tensor data_xy_
 
 template <typename T>
-void nest2xy_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_xy, 
+void nest2xy_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_xy,
     const int src_origin, const bool src_clockwise,
-    const int dest_origin, const bool dest_clockwise, 
+    const int dest_origin, const bool dest_clockwise,
     const int nside, const size_t num_elements) {
 
     // Check tensor dimensions and sizes
@@ -848,9 +1022,9 @@ void nest2xy_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data
 }
 
 template <typename T>
-void xy2nest_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_in_nest, 
+void xy2nest_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_in_nest,
     const int src_origin, const bool src_clockwise,
-    const int dest_origin, const bool dest_clockwise, 
+    const int dest_origin, const bool dest_clockwise,
     const int nside, const size_t num_elements) {
 
     // Check tensor dimensions and sizes
@@ -905,9 +1079,9 @@ void xy2nest_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_i
 }
 
 template <typename T>
-void ring2xy_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data_in_xy, 
+void ring2xy_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data_in_xy,
     const int src_origin, const bool src_clockwise,
-    const int dest_origin, const bool dest_clockwise, 
+    const int dest_origin, const bool dest_clockwise,
     const int nside, const size_t num_elements) {
 
     // Check tensor dimensions and sizes
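[Editor's note] In rearrange_data_kernel_3d_batch and rearrange_data_kernel_batch above, tid - idx is the offset of the current batch slice, so d_data_out[tid - idx + pix] permutes each slice independently with the same pixel map. The same algebra on the host, for reference (a sketch; in, out, pix, m, n, npix are hypothetical host-side arrays and sizes):

    // Host reference for the batched scatter: every (z, y) slice of length
    // npix is permuted by the same map pix[].
    for (size_t z = 0; z < m; ++z)
        for (size_t y = 0; y < n; ++y)
            for (size_t x = 0; x < npix; ++x)
                out[(z * n + y) * npix + pix[x]] = in[(z * n + y) * npix + x];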
@@ -963,9 +1137,9 @@ void ring2xy_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data
 
 template <typename T>
-void xy2ring_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_in_ring, 
+void xy2ring_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_in_ring,
     const int src_origin, const bool src_clockwise,
-    const int dest_origin, const bool dest_clockwise, 
+    const int dest_origin, const bool dest_clockwise,
     const int nside, const size_t num_elements) {
 
     // Check tensor dimensions and sizes
@@ -1021,7 +1195,7 @@ void xy2ring_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_i
 
 template <typename T>
-void ring2nest_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data_in_nest, 
+void ring2nest_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data_in_nest,
                                     const int nside, const size_t num_elements) {
 
     // Check tensor dimensions and sizes
@@ -1070,7 +1244,7 @@ void ring2nest_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor da
 }
 
 template <typename T>
-void nest2ring_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_ring, 
+void nest2ring_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_ring,
                                     const int nside, const size_t num_elements) {
 
     // Check tensor dimensions and sizes
@@ -1137,11 +1677,11 @@ void nest2ring_batch_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_
 
 void nest2xy_batch_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_xy, const int src_origin, const bool src_clockwise,
                             const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements) {
-    
+
     AT_DISPATCH_ALL_TYPES(data_in_nest.scalar_type(), "nest2xy_batch", ([&] {
         nest2xy_batch_kernel_wrapper<scalar_t>(data_in_nest, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
     }));
-    
+
 }
 
 void xy2nest_batch_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_nest, const int src_origin, const bool src_clockwise,
@@ -1159,7 +1699,7 @@ void ring2xy_batch_dispatch(torch::Tensor data_in_ring, torch::Tensor data_in_xy
     AT_DISPATCH_ALL_TYPES(data_in_ring.scalar_type(), "ring2xy_batch", ([&] {
         ring2xy_batch_kernel_wrapper<scalar_t>(data_in_ring, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
     }));
-    
+
 }
 
 void xy2ring_batch_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_ring, const int src_origin, const bool src_clockwise,
@@ -1168,7 +1708,7 @@ void xy2ring_batch_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_ring
     AT_DISPATCH_ALL_TYPES(data_in_xy.scalar_type(), "xy2ring_batch", ([&] {
         xy2ring_batch_kernel_wrapper<scalar_t>(data_in_xy, data_in_ring, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
     }));
-    
+
 }
 
 void xy2xy_batch_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, const int src_origin, const bool src_clockwise,
@@ -1177,6 +1717,5 @@ void xy2xy_batch_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, c
     AT_DISPATCH_ALL_TYPES(data_xy_in.scalar_type(), "xy2xy_batch", ([&] {
         xy2xy_batch_kernel_wrapper<scalar_t>(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
     }));
-    
-}
+}
diff --git a/src/harmonic_transform/hpx_fft.cpp b/src/harmonic_transform/hpx_fft.cpp
index ddacfd2..9ec28ac 100644
--- a/src/harmonic_transform/hpx_fft.cpp
+++ b/src/harmonic_transform/hpx_fft.cpp
@@ -15,7 +15,6 @@
  * limitations under the License.
  */
 
-#include
 #include "hpx_fft.h"
 #include
@@ -422,6 +421,11 @@ torch::Tensor healpix_irfft_cuda(torch::Tensor ftm, int L, int nside) {
 }
 
+// PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+//     m.def("healpix_rfft_cuda", &healpix_rfft_cuda, "HEALPix RFFT with cuFFT and CUDA");
+//     m.def("healpix_irfft_cuda", &healpix_irfft_cuda, "HEALPix IRFFT with cuFFT and CUDA");
+// }
+
 int nphi_ring(int t, int nside);
 int cumulative_nphi_ring(int t, int nside);
 double p2phi_ring(int t, int offset, int nside);

From 5de3f7c6a2afd45d68368d139c11a25676021bf2 Mon Sep 17 00:00:00 2001
From: xiaopoc <xiaopoc@nvidia.com>
Date: Wed, 13 Aug 2025 11:10:19 -0700
Subject: [PATCH 6/6] update

---
 pyproject.toml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 326b0fa..7cf4538 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,8 +7,7 @@ name = "cuHPX"
 version = "2025.8.1"
 description = "CUDA-accelerated HEALPix tools for harmonic transforms and remapping"
 authors = [
-    { name = "Xiaopo Cheng", email = "xiaopoc@nvidia.com" },
-    { name = "Akshay Subramaniam", email = "asubramaniam@nvidia.com" }
+    { name = "NVIDIA", email = "asubramaniam@nvidia.com" }
 ]
 readme = "README.md"
 license = { file = "LICENSE.txt" }
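[Editor's note] For quick validation of the remapping path touched by these patches, a round-trip check along these lines is useful: ring2nest followed by nest2ring must reproduce the input exactly. This is a hypothetical smoke test, not part of the series; it assumes a CUDA device, libtorch, and the dispatch declarations from hpx_remapping.h:

    // Hypothetical smoke test: ring -> nest -> ring should be the identity.
    #include <torch/torch.h>
    #include "hpx_remapping.h"

    int main() {
        const int nside = 64;
        const size_t npix = static_cast<size_t>(nside) * nside * 12;

        // Distinct values per pixel make any misrouted index visible.
        auto ring = torch::arange(static_cast<int64_t>(npix),
                                  torch::dtype(torch::kFloat).device(torch::kCUDA));
        auto nest = torch::empty_like(ring);
        auto back = torch::empty_like(ring);

        ring2nest_dispatch(ring, nest, nside, npix);
        nest2ring_dispatch(nest, back, nside, npix);

        TORCH_CHECK(torch::equal(ring, back), "ring->nest->ring round trip failed");
        return 0;
    }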