From 8239fc7fd725e966ca68d47c82809dbf2d67b00f Mon Sep 17 00:00:00 2001
From: xiaopoc
Date: Mon, 2 Jun 2025 11:35:31 -0700
Subject: [PATCH 1/6] Add CMakeLists.txt

---
 CMakeLists.txt | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 CMakeLists.txt

diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..99a9700
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,69 @@
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
+project(cuhpx LANGUAGES CXX CUDA)
+
+# C++ and CUDA standard
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+# Find Python and determine site-packages path
+find_package(Python REQUIRED COMPONENTS Interpreter Development.Module)
+execute_process(
+    COMMAND "${Python_EXECUTABLE}" -c "import sysconfig; print(sysconfig.get_paths()['purelib'])"
+    OUTPUT_VARIABLE PYTHON_SITE_PACKAGES
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+# Add PyTorch CMake path
+list(APPEND CMAKE_PREFIX_PATH "${PYTHON_SITE_PACKAGES}/torch/share/cmake")
+
+# Find required packages
+find_package(Torch REQUIRED)
+find_package(pybind11 REQUIRED)
+find_package(CUDA REQUIRED)
+
+# Optional: Locate libtorch_python.so dynamically
+set(TORCH_PYTHON_LIB "${PYTHON_SITE_PACKAGES}/torch/lib/libtorch_python.so")
+
+# Set CUDA architecture (change if needed)
+set(CMAKE_CUDA_ARCHITECTURES 90)
+list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr --expt-extended-lambda -O3)
+
+# Include paths
+include_directories(
+    ${TORCH_INCLUDE_DIRS}
+    ${CUDA_INCLUDE_DIRS}
+    ${CMAKE_CURRENT_SOURCE_DIR}/src
+)
+
+# ==================== cuhpx_fft ====================
+set(CUHPX_FFT_SRC
+    src/harmonic_transform/hpx_fft.cpp
+    src/harmonic_transform/hpx_fft_cuda.cu
+)
+
+pybind11_add_module(cuhpx_fft MODULE ${CUHPX_FFT_SRC})
+target_link_libraries(cuhpx_fft PRIVATE ${TORCH_LIBRARIES} ${CUDA_LIBRARIES})
+target_link_libraries(cuhpx_fft PRIVATE ${TORCH_PYTHON_LIB})
+target_compile_definitions(cuhpx_fft PRIVATE ${TORCH_CXX_FLAGS})
+set_target_properties(cuhpx_fft PROPERTIES PREFIX "" LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cuhpx)
+
+# ==================== cuhpx_remap ====================
+set(CUHPX_REMAP_SRC
+    src/data_remapping/hpx_remapping.cpp
+    src/data_remapping/hpx_remapping_cuda.cu
+)
+
+pybind11_add_module(cuhpx_remap MODULE ${CUHPX_REMAP_SRC})
+target_link_libraries(cuhpx_remap PRIVATE ${TORCH_LIBRARIES} ${CUDA_LIBRARIES})
+target_link_libraries(cuhpx_remap PRIVATE ${TORCH_PYTHON_LIB})
+target_compile_definitions(cuhpx_remap PRIVATE ${TORCH_CXX_FLAGS})
+set_target_properties(cuhpx_remap PROPERTIES PREFIX "" LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cuhpx)
+
+# ==================== Install ====================
+install(TARGETS cuhpx_fft cuhpx_remap DESTINATION cuhpx)
+install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cuhpx/data DESTINATION cuhpx FILES_MATCHING PATTERN "*.fits")
+
+message(STATUS "✅ cuhpx build system configured successfully.")
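A minimal smoke test for the configuration above (a sketch, not part of the series: it assumes the extensions were built with something like `cmake -S . -B build && cmake --build build`, that the resulting modules landed in the cuhpx/ directory named by LIBRARY_OUTPUT_DIRECTORY, and that torch is importable; the script name and path handling are illustrative):

    # smoke_test.py -- hypothetical check, not part of this patch series
    import sys

    import torch  # import first so libtorch symbols are resolved

    sys.path.insert(0, "cuhpx")  # directory holding the built .so files

    import cuhpx_remap

    print("loaded:", cuhpx_remap.__name__)
    print(cuhpx_remap.ring2nest.__doc__)  # pybind11 doc: "Convert ring to nest (CUDA)"
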
From 0de9f5d2b31b89b3a68d87b8fd19894d60eb4339 Mon Sep 17 00:00:00 2001
From: xiaopoc
Date: Mon, 2 Jun 2025 11:42:11 -0700
Subject: [PATCH 2/6] Clean up

---
 src/data_remapping/hpx_remapping.cpp     | 946 +++++++++++------------
 src/data_remapping/hpx_remapping.h       |   4 +-
 src/data_remapping/hpx_remapping_cuda.cu | 627 ++-------------
 src/harmonic_transform/hpx_fft.cpp       |   5 -
 4 files changed, 513 insertions(+), 1069 deletions(-)

diff --git a/src/data_remapping/hpx_remapping.cpp b/src/data_remapping/hpx_remapping.cpp
index fa136d0..aaff451 100644
--- a/src/data_remapping/hpx_remapping.cpp
+++ b/src/data_remapping/hpx_remapping.cpp
@@ -1,478 +1,468 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <torch/extension.h>
-#include "hpx_remapping.h"
-
-torch::Tensor ring2nest(torch::Tensor data_in_ring, const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_in_nest = torch::empty_like(data_in_ring);
-
-    ring2nest_dispatch(data_in_ring, data_in_nest, nside, num_elements);
-
-    return data_in_nest;
-}
-
-
-torch::Tensor nest2ring(torch::Tensor data_in_nest, const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_in_ring = torch::empty_like(data_in_nest);
-
-    nest2ring_dispatch(data_in_nest, data_in_ring, nside, num_elements);
-
-    return data_in_ring;
-}
-
-torch::Tensor nest2xy(torch::Tensor data_in_nest, const std::string& origin, const bool clockwise,
-    const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_in_xy = torch::empty_like(data_in_nest);
-
-    int src_origin = 0;
-    int dest_origin;
-
-    bool src_clockwise = false;
-    bool dest_clockwise = clockwise;
-
-    if (origin == "S"){
-        dest_origin = 0;
-    } else if (origin == "E"){
-        dest_origin = 1;
-    } else if (origin == "N"){
-        dest_origin = 2;
-    } else if (origin == "W"){
-        dest_origin = 3;
-    } else {
-        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
-    }
-
-
-    nest2xy_dispatch(data_in_nest, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
-
-    return data_in_xy;
-}
-
-torch::Tensor xy2nest(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise,
-    const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_in_nest = torch::empty_like(data_in_xy);
-
-
-    int src_origin;
-    int dest_origin = 0;
-
-    bool src_clockwise = clockwise;
-    bool dest_clockwise = false;
-
-    if (origin == "S"){
-        src_origin = 0;
-    } else if (origin == "E"){
-        src_origin = 1;
-    } else if (origin == "N"){
-        src_origin = 2;
-    } else if (origin == "W"){
-        src_origin = 3;
-    } else {
-        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
-    }
-
-
-    xy2nest_dispatch(data_in_xy, data_in_nest, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
-
-    return data_in_nest;
-}
-
-torch::Tensor ring2xy(torch::Tensor data_in_ring, const std::string& origin, const bool clockwise,
-    const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_in_xy = torch::empty_like(data_in_ring);
-
-    int src_origin = 0;
-    int dest_origin;
-
-    bool src_clockwise = false;
-    bool dest_clockwise = clockwise;
-
-    if (origin == "S"){
-        dest_origin = 0;
-    } else if (origin == "E"){
-        dest_origin = 1;
-    } else if (origin == "N"){
-        dest_origin = 2;
-    } else if (origin == "W"){
-        dest_origin = 3;
-    } else {
-        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
-    }
-
-
-    ring2xy_dispatch(data_in_ring, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
-
-    return data_in_xy;
-}
-
-torch::Tensor xy2ring(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise,
-    const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_in_ring = torch::empty_like(data_in_xy);
-
-    int src_origin;
-    int dest_origin = 0;
-
-    bool src_clockwise = clockwise;
-    bool dest_clockwise = false;
-
-    if (origin == "S"){
-        src_origin = 0;
-    } else if (origin == "E"){
-        src_origin = 1;
-    } else if (origin == "N"){
-        src_origin = 2;
-    } else if (origin == "W"){
-        src_origin = 3;
-    } else {
-        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
-    }
-
-
-    xy2ring_dispatch(data_in_xy, data_in_ring, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
-
-    return data_in_ring;
-}
-
-torch::Tensor xy2xy(torch::Tensor data_xy_in, const std::string& s_origin, const bool src_clockwise, const std::string& d_origin, const bool dest_clockwise,
-    const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_xy_out = torch::empty_like(data_xy_in);
-
-    int src_origin;
-    int dest_origin;
-
-
-    if (s_origin == "S"){
-        src_origin = 0;
-    } else if (s_origin == "E"){
-        src_origin = 1;
-    } else if (s_origin == "N"){
-        src_origin = 2;
-    } else if (s_origin == "W"){
-        src_origin = 3;
-    } else {
-        TORCH_CHECK(false, "The src origin must be one from S, E, N, W. Stop");
-    }
-
-    if (d_origin == "S"){
-        dest_origin = 0;
-    } else if (d_origin == "E"){
-        dest_origin = 1;
-    } else if (d_origin == "N"){
-        dest_origin = 2;
-    } else if (d_origin == "W"){
-        dest_origin = 3;
-    } else {
-        TORCH_CHECK(false, "The dest origin must be one from S, E, N, W. Stop");
-    }
-
-
-    xy2xy_dispatch(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
-
-    return data_xy_out;
-}
-
-void benchmark_nest_ring(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    benchmark_nest_ring_dispatch(data_in_nest, data_in_ring, nside, num_elements);
-}
-
-
-torch::Tensor ring2nest_batch(torch::Tensor data_in_ring, const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_in_nest = torch::empty_like(data_in_ring);
-
-    ring2nest_batch_dispatch(data_in_ring, data_in_nest, nside, num_elements);
-
-    return data_in_nest;
-}
-
-
-torch::Tensor nest2ring_batch(torch::Tensor data_in_nest, const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_in_ring = torch::empty_like(data_in_nest);
-
-    nest2ring_batch_dispatch(data_in_nest, data_in_ring, nside, num_elements);
-
-    return data_in_ring;
-}
-
-torch::Tensor nest2xy_batch(torch::Tensor data_in_nest, const std::string& origin, const bool clockwise,
-    const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_in_xy = torch::empty_like(data_in_nest);
-
-    int src_origin = 0;
-    int dest_origin;
-
-    bool src_clockwise = false;
-    bool dest_clockwise = clockwise;
-
-    if (origin == "S"){
-        dest_origin = 0;
-    } else if (origin == "E"){
-        dest_origin = 1;
-    } else if (origin == "N"){
-        dest_origin = 2;
-    } else if (origin == "W"){
-        dest_origin = 3;
-    } else {
-        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
-    }
-
-
-    nest2xy_batch_dispatch(data_in_nest, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
-
-    return data_in_xy;
-}
-
-torch::Tensor xy2nest_batch(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise,
-    const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_in_nest = torch::empty_like(data_in_xy);
-
-
-    int src_origin;
-    int dest_origin = 0;
-
-    bool src_clockwise = clockwise;
-    bool dest_clockwise = false;
-
-    if (origin == "S"){
-        src_origin = 0;
-    } else if (origin == "E"){
-        src_origin = 1;
-    } else if (origin == "N"){
-        src_origin = 2;
-    } else if (origin == "W"){
-        src_origin = 3;
-    } else {
-        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
-    }
-
-
-    xy2nest_batch_dispatch(data_in_xy, data_in_nest, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
-
-    return data_in_nest;
-}
-
-torch::Tensor ring2xy_batch(torch::Tensor data_in_ring, const std::string& origin, const bool clockwise,
-    const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_in_xy = torch::empty_like(data_in_ring);
-
-    int src_origin = 0;
-    int dest_origin;
-
-    bool src_clockwise = false;
-    bool dest_clockwise = clockwise;
-
-    if (origin == "S"){
-        dest_origin = 0;
-    } else if (origin == "E"){
-        dest_origin = 1;
-    } else if (origin == "N"){
-        dest_origin = 2;
-    } else if (origin == "W"){
-        dest_origin = 3;
-    } else {
-        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
-    }
-
-
-    ring2xy_batch_dispatch(data_in_ring, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
-
-    return data_in_xy;
-}
-
-torch::Tensor xy2ring_batch(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise,
-    const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_in_ring = torch::empty_like(data_in_xy);
-
-    int src_origin;
-    int dest_origin = 0;
-
-    bool src_clockwise = clockwise;
-    bool dest_clockwise = false;
-
-    if (origin == "S"){
-        src_origin = 0;
-    } else if (origin == "E"){
-        src_origin = 1;
-    } else if (origin == "N"){
-        src_origin = 2;
-    } else if (origin == "W"){
-        src_origin = 3;
-    } else {
-        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
-    }
-
-
-    xy2ring_batch_dispatch(data_in_xy, data_in_ring, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
-
-    return data_in_ring;
-}
-
-torch::Tensor xy2xy_batch(torch::Tensor data_xy_in, const std::string& s_origin, const bool src_clockwise, const std::string& d_origin, const bool dest_clockwise,
-    const int nside, const size_t num_elements) {
-
-    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
-
-    // Check if num_elements matches the expected number of elements
-    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
-
-    auto data_xy_out = torch::empty_like(data_xy_in);
-
-    int src_origin;
-    int dest_origin;
-
-
-    if (s_origin == "S"){
-        src_origin = 0;
-    } else if (s_origin == "E"){
-        src_origin = 1;
-    } else if (s_origin == "N"){
-        src_origin = 2;
-    } else if (s_origin == "W"){
-        src_origin = 3;
-    } else {
-        TORCH_CHECK(false, "The src origin must be one from S, E, N, W. Stop");
-    }
-
-    if (d_origin == "S"){
-        dest_origin = 0;
-    } else if (d_origin == "E"){
-        dest_origin = 1;
-    } else if (d_origin == "N"){
-        dest_origin = 2;
-    } else if (d_origin == "W"){
-        dest_origin = 3;
-    } else {
-        TORCH_CHECK(false, "The dest origin must be one from S, E, N, W. Stop");
-    }
-
-
-    xy2xy_batch_dispatch(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
-
-    return data_xy_out;
-}
-
-
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-
-    m.def("ring2nest", &ring2nest, "Convert ring to nest (CUDA)");
-    m.def("nest2ring", &nest2ring, "Convert nest to ring (CUDA)");
-
-    m.def("nest2xy", &nest2xy, "Convert nest to xy (CUDA)");
-    m.def("xy2nest", &xy2nest, "Convert xy to nest (CUDA)");
-
-    m.def("ring2xy", &ring2xy, "Convert ring to xy (CUDA)");
-    m.def("xy2ring", &xy2ring, "Convert xy to ring (CUDA)");
-
-    m.def("xy2xy", &xy2xy, "Convert xy to xy (CUDA)");
-
-    m.def("benchmark_nest_ring", &benchmark_nest_ring, "Benchmark nest and ring (CUDA)");
-
-    m.def("ring2nest_batch", &ring2nest_batch, "Convert ring to nest (CUDA) in batch");
-    m.def("nest2ring_batch", &nest2ring_batch, "Convert nest to ring (CUDA) in batch");
-
-    m.def("nest2xy_batch", &nest2xy_batch, "Convert nest to xy (CUDA) in batch");
-    m.def("xy2nest_batch", &xy2nest_batch, "Convert xy to nest (CUDA) in batch");
-
-    m.def("ring2xy_batch", &ring2xy_batch, "Convert ring to xy (CUDA) in batch");
-    m.def("xy2ring_batch", &xy2ring_batch, "Convert xy to ring (CUDA) in batch");
-
-    m.def("xy2xy_batch", &xy2xy_batch, "Convert xy to xy (CUDA) in batch");
-
-}
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <torch/extension.h>
+#include <string>
+#include "hpx_remapping.h"
+
+torch::Tensor ring2nest(torch::Tensor data_in_ring, const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_in_nest = torch::empty_like(data_in_ring);
+
+    ring2nest_dispatch(data_in_ring, data_in_nest, nside, num_elements);
+
+    return data_in_nest;
+}
+
+
+torch::Tensor nest2ring(torch::Tensor data_in_nest, const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_in_ring = torch::empty_like(data_in_nest);
+
+    nest2ring_dispatch(data_in_nest, data_in_ring, nside, num_elements);
+
+    return data_in_ring;
+}
+
+torch::Tensor nest2xy(torch::Tensor data_in_nest, const std::string& origin, const bool clockwise,
+    const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_in_xy = torch::empty_like(data_in_nest);
+
+    int src_origin = 0;
+    int dest_origin;
+
+    bool src_clockwise = false;
+    bool dest_clockwise = clockwise;
+
+    if (origin == "S"){
+        dest_origin = 0;
+    } else if (origin == "E"){
+        dest_origin = 1;
+    } else if (origin == "N"){
+        dest_origin = 2;
+    } else if (origin == "W"){
+        dest_origin = 3;
+    } else {
+        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
+    }
+
+
+    nest2xy_dispatch(data_in_nest, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
+
+    return data_in_xy;
+}
+
+torch::Tensor xy2nest(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise,
+    const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_in_nest = torch::empty_like(data_in_xy);
+
+
+    int src_origin;
+    int dest_origin = 0;
+
+    bool src_clockwise = clockwise;
+    bool dest_clockwise = false;
+
+    if (origin == "S"){
+        src_origin = 0;
+    } else if (origin == "E"){
+        src_origin = 1;
+    } else if (origin == "N"){
+        src_origin = 2;
+    } else if (origin == "W"){
+        src_origin = 3;
+    } else {
+        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
+    }
+
+
+    xy2nest_dispatch(data_in_xy, data_in_nest, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
+
+    return data_in_nest;
+}
+
+torch::Tensor ring2xy(torch::Tensor data_in_ring, const std::string& origin, const bool clockwise,
+    const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_in_xy = torch::empty_like(data_in_ring);
+
+    int src_origin = 0;
+    int dest_origin;
+
+    bool src_clockwise = false;
+    bool dest_clockwise = clockwise;
+
+    if (origin == "S"){
+        dest_origin = 0;
+    } else if (origin == "E"){
+        dest_origin = 1;
+    } else if (origin == "N"){
+        dest_origin = 2;
+    } else if (origin == "W"){
+        dest_origin = 3;
+    } else {
+        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
+    }
+
+
+    ring2xy_dispatch(data_in_ring, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
+
+    return data_in_xy;
+}
+
+torch::Tensor xy2ring(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise,
+    const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_in_ring = torch::empty_like(data_in_xy);
+
+    int src_origin;
+    int dest_origin = 0;
+
+    bool src_clockwise = clockwise;
+    bool dest_clockwise = false;
+
+    if (origin == "S"){
+        src_origin = 0;
+    } else if (origin == "E"){
+        src_origin = 1;
+    } else if (origin == "N"){
+        src_origin = 2;
+    } else if (origin == "W"){
+        src_origin = 3;
+    } else {
+        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
+    }
+
+
+    xy2ring_dispatch(data_in_xy, data_in_ring, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
+
+    return data_in_ring;
+}
+
+torch::Tensor xy2xy(torch::Tensor data_xy_in, const std::string& s_origin, const bool src_clockwise, const std::string& d_origin, const bool dest_clockwise,
+    const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_xy_out = torch::empty_like(data_xy_in);
+
+    int src_origin;
+    int dest_origin;
+
+
+    if (s_origin == "S"){
+        src_origin = 0;
+    } else if (s_origin == "E"){
+        src_origin = 1;
+    } else if (s_origin == "N"){
+        src_origin = 2;
+    } else if (s_origin == "W"){
+        src_origin = 3;
+    } else {
+        TORCH_CHECK(false, "The src origin must be one from S, E, N, W. Stop");
+    }
+
+    if (d_origin == "S"){
+        dest_origin = 0;
+    } else if (d_origin == "E"){
+        dest_origin = 1;
+    } else if (d_origin == "N"){
+        dest_origin = 2;
+    } else if (d_origin == "W"){
+        dest_origin = 3;
+    } else {
+        TORCH_CHECK(false, "The dest origin must be one from S, E, N, W. Stop");
+    }
+
+
+    xy2xy_dispatch(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
+
+    return data_xy_out;
+}
+
+
+torch::Tensor ring2nest_batch(torch::Tensor data_in_ring, const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_in_nest = torch::empty_like(data_in_ring);
+
+    ring2nest_batch_dispatch(data_in_ring, data_in_nest, nside, num_elements);
+
+    return data_in_nest;
+}
+
+
+torch::Tensor nest2ring_batch(torch::Tensor data_in_nest, const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_in_ring = torch::empty_like(data_in_nest);
+
+    nest2ring_batch_dispatch(data_in_nest, data_in_ring, nside, num_elements);
+
+    return data_in_ring;
+}
+
+torch::Tensor nest2xy_batch(torch::Tensor data_in_nest, const std::string& origin, const bool clockwise,
+    const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_in_xy = torch::empty_like(data_in_nest);
+
+    int src_origin = 0;
+    int dest_origin;
+
+    bool src_clockwise = false;
+    bool dest_clockwise = clockwise;
+
+    if (origin == "S"){
+        dest_origin = 0;
+    } else if (origin == "E"){
+        dest_origin = 1;
+    } else if (origin == "N"){
+        dest_origin = 2;
+    } else if (origin == "W"){
+        dest_origin = 3;
+    } else {
+        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
+    }
+
+
+    nest2xy_batch_dispatch(data_in_nest, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
+
+    return data_in_xy;
+}
+
+torch::Tensor xy2nest_batch(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise,
+    const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_in_nest = torch::empty_like(data_in_xy);
+
+
+    int src_origin;
+    int dest_origin = 0;
+
+    bool src_clockwise = clockwise;
+    bool dest_clockwise = false;
+
+    if (origin == "S"){
+        src_origin = 0;
+    } else if (origin == "E"){
+        src_origin = 1;
+    } else if (origin == "N"){
+        src_origin = 2;
+    } else if (origin == "W"){
+        src_origin = 3;
+    } else {
+        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
+    }
+
+
+    xy2nest_batch_dispatch(data_in_xy, data_in_nest, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
+
+    return data_in_nest;
+}
+
+torch::Tensor ring2xy_batch(torch::Tensor data_in_ring, const std::string& origin, const bool clockwise,
+    const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_in_xy = torch::empty_like(data_in_ring);
+
+    int src_origin = 0;
+    int dest_origin;
+
+    bool src_clockwise = false;
+    bool dest_clockwise = clockwise;
+
+    if (origin == "S"){
+        dest_origin = 0;
+    } else if (origin == "E"){
+        dest_origin = 1;
+    } else if (origin == "N"){
+        dest_origin = 2;
+    } else if (origin == "W"){
+        dest_origin = 3;
+    } else {
+        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
+    }
+
+
+    ring2xy_batch_dispatch(data_in_ring, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
+
+    return data_in_xy;
+}
+
+torch::Tensor xy2ring_batch(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise,
+    const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_in_ring = torch::empty_like(data_in_xy);
+
+    int src_origin;
+    int dest_origin = 0;
+
+    bool src_clockwise = clockwise;
+    bool dest_clockwise = false;
+
+    if (origin == "S"){
+        src_origin = 0;
+    } else if (origin == "E"){
+        src_origin = 1;
+    } else if (origin == "N"){
+        src_origin = 2;
+    } else if (origin == "W"){
+        src_origin = 3;
+    } else {
+        TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop");
+    }
+
+
+    xy2ring_batch_dispatch(data_in_xy, data_in_ring, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
+
+    return data_in_ring;
+}
+
+torch::Tensor xy2xy_batch(torch::Tensor data_xy_in, const std::string& s_origin, const bool src_clockwise, const std::string& d_origin, const bool dest_clockwise,
+    const int nside, const size_t num_elements) {
+
+    const size_t expected_num_elements = static_cast<size_t>(nside)*nside*12;
+
+    // Check if num_elements matches the expected number of elements
+    TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop.");
+
+    auto data_xy_out = torch::empty_like(data_xy_in);
+
+    int src_origin;
+    int dest_origin;
+
+
+    if (s_origin == "S"){
+        src_origin = 0;
+    } else if (s_origin == "E"){
+        src_origin = 1;
+    } else if (s_origin == "N"){
+        src_origin = 2;
+    } else if (s_origin == "W"){
+        src_origin = 3;
+    } else {
+        TORCH_CHECK(false, "The src origin must be one from S, E, N, W. Stop");
+    }
+
+    if (d_origin == "S"){
+        dest_origin = 0;
+    } else if (d_origin == "E"){
+        dest_origin = 1;
+    } else if (d_origin == "N"){
+        dest_origin = 2;
+    } else if (d_origin == "W"){
+        dest_origin = 3;
+    } else {
+        TORCH_CHECK(false, "The dest origin must be one from S, E, N, W. Stop");
+    }
+
+
+    xy2xy_batch_dispatch(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
+
+    return data_xy_out;
+}
+
+
+
+PYBIND11_MODULE(cuhpx_remap, m) {
+
+    m.def("ring2nest", &ring2nest, "Convert ring to nest (CUDA)");
+    m.def("nest2ring", &nest2ring, "Convert nest to ring (CUDA)");
+
+    m.def("nest2xy", &nest2xy, "Convert nest to xy (CUDA)");
+    m.def("xy2nest", &xy2nest, "Convert xy to nest (CUDA)");
+
+    m.def("ring2xy", &ring2xy, "Convert ring to xy (CUDA)");
+    m.def("xy2ring", &xy2ring, "Convert xy to ring (CUDA)");
+
+    m.def("xy2xy", &xy2xy, "Convert xy to xy (CUDA)");
+
+    m.def("ring2nest_batch", &ring2nest_batch, "Convert ring to nest (CUDA) in batch");
+    m.def("nest2ring_batch", &nest2ring_batch, "Convert nest to ring (CUDA) in batch");
+
+    m.def("nest2xy_batch", &nest2xy_batch, "Convert nest to xy (CUDA) in batch");
+    m.def("xy2nest_batch", &xy2nest_batch, "Convert xy to nest (CUDA) in batch");
+
+    m.def("ring2xy_batch", &ring2xy_batch, "Convert ring to xy (CUDA) in batch");
+    m.def("xy2ring_batch", &xy2ring_batch, "Convert xy to ring (CUDA) in batch");
+
+    m.def("xy2xy_batch", &xy2xy_batch, "Convert xy to xy (CUDA) in batch");
+
+}
+

diff --git a/src/data_remapping/hpx_remapping.h b/src/data_remapping/hpx_remapping.h
index 2edc77d..5ed936a 100644
--- a/src/data_remapping/hpx_remapping.h
+++ b/src/data_remapping/hpx_remapping.h
@@ -43,9 +43,6 @@ void xy2ring_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_ring, cons
 void xy2xy_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, const int src_origin, const bool src_clockwise,
     const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements);
 
-void benchmark_nest_ring_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements);
-
-
 void ring2nest_batch_dispatch(torch::Tensor data_in_ring, torch::Tensor data_in_nest, const int nside, const size_t num_elements);
 
 void nest2ring_batch_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements);
 
@@ -66,3 +63,4 @@ void xy2xy_batch_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, c
     const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements);
 
 #endif // HPX_REMAPPING_H
+
Stop"); + } + + + xy2xy_batch_dispatch(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); + + return data_xy_out; +} + + + +PYBIND11_MODULE(cuhpx_remap, m) { + + m.def("ring2nest", &ring2nest, "Convert ring to nest (CUDA)"); + m.def("nest2ring", &nest2ring, "Convert nest to ring (CUDA)"); + + m.def("nest2xy", &nest2xy, "Convert nest to xy (CUDA)"); + m.def("xy2nest", &xy2nest, "Convert xy to nest (CUDA)"); + + m.def("ring2xy", &ring2xy, "Convert ring to xy (CUDA)"); + m.def("xy2ring", &xy2ring, "Convert xy to ring (CUDA)"); + + m.def("xy2xy", &xy2xy, "Convert xy to xy (CUDA)"); + + m.def("ring2nest_batch", &ring2nest_batch, "Convert ring to nest (CUDA) in batch"); + m.def("nest2ring_batch", &nest2ring_batch, "Convert nest to ring (CUDA) in batch"); + + m.def("nest2xy_batch", &nest2xy_batch, "Convert nest to xy (CUDA) in batch"); + m.def("xy2nest_batch", &xy2nest_batch, "Convert xy to nest (CUDA) in batch"); + + m.def("ring2xy_batch", &ring2xy_batch, "Convert ring to xy (CUDA) in batch"); + m.def("xy2ring_batch", &xy2ring_batch, "Convert xy to ring (CUDA) in batch"); + + m.def("xy2xy_batch", &xy2xy_batch, "Convert xy to xy (CUDA) in batch"); + +} + diff --git a/src/data_remapping/hpx_remapping.h b/src/data_remapping/hpx_remapping.h index 2edc77d..5ed936a 100644 --- a/src/data_remapping/hpx_remapping.h +++ b/src/data_remapping/hpx_remapping.h @@ -43,9 +43,6 @@ void xy2ring_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_ring, cons void xy2xy_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, const int src_origin, const bool src_clockwise, const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements); -void benchmark_nest_ring_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements); - - void ring2nest_batch_dispatch(torch::Tensor data_in_ring, torch::Tensor data_in_nest, const int nside, const size_t num_elements); void nest2ring_batch_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements); @@ -66,3 +63,4 @@ void xy2xy_batch_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, c const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements); #endif // HPX_REMAPPING_H + diff --git a/src/data_remapping/hpx_remapping_cuda.cu b/src/data_remapping/hpx_remapping_cuda.cu index 41cf8f0..eb0f677 100644 --- a/src/data_remapping/hpx_remapping_cuda.cu +++ b/src/data_remapping/hpx_remapping_cuda.cu @@ -133,15 +133,15 @@ __device__ inline void correct_xy_orient(int& ix, int& iy, const int order, cons iy = new_iy; } -template +template __global__ void rearrange_data_kernel_naive(const T* d_data_in, T* d_data_out, const I* d_pix_array, const size_t num_elements){ - + int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < num_elements){ I pix = d_pix_array[tid]; d_data_out[pix] = d_data_in[tid]; - + } } @@ -157,7 +157,7 @@ template } -template +template __global__ void ring2xyf_kernel(const I* pix_array, int* ix_array, int* iy_array, int* face_num_array, const int order, const I nside, const size_t num_elements){ @@ -192,10 +192,10 @@ __global__ void ring2xyf_kernel(const I* pix_array, int* ix_array, int* iy_array I irm = 2*nside + 1 - tmp; I ifm = iphi - (ire >> 1) + nside - 1; I ifp = iphi - (irm >> 1) + nside - 1; - + ifm >>= order; ifp >>= order; - + face_num = (ifp == ifm) ? (ifp | 4) : ((ifp < ifm) ? 
ifp : (ifm + 8)); } else { // South Polar Cap I ip = npix - pix; @@ -224,8 +224,8 @@ __global__ void ring2xyf_kernel(const I* pix_array, int* ix_array, int* iy_array } -template -__global__ void xyf2ring_kernel(I* pix_array, const int* ix_array, const int* iy_array, const int* face_num_array, +template +__global__ void xyf2ring_kernel(I* pix_array, const int* ix_array, const int* iy_array, const int* face_num_array, const int order, const I nside, const size_t num_elements) { int tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -281,7 +281,7 @@ __global__ void xyf2ring_kernel(I* pix_array, const int* ix_array, const int* iy } } -template +template __global__ void nest2xyf_kernel(const I* pix_array, int* ix_array, int* iy_array, int* face_num_array, const int order, const I nside, const size_t num_elements) { @@ -303,7 +303,7 @@ __global__ void nest2xyf_kernel(const I* pix_array, int* ix_array, int* iy_array } } -template +template __global__ void xyf2nest_kernel(I* pix_array, const int* ix_array, const int* iy_array, const int* face_num_array, const int order, const I nside, const size_t num_elements){ @@ -601,7 +601,7 @@ void ring2nest_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data_in_ ring2xyf_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); xyf2nest_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); rearrange_data_kernel_naive <<>>(data_in_ring.data_ptr(), data_in_nest.data_ptr(), d_pix_array, num_elements); - + //cudaDeviceSynchronize(); cudaFree(d_pix_array); @@ -633,7 +633,7 @@ void nest2ring_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_ nest2xyf_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); xyf2ring_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); rearrange_data_kernel_naive <<>>(data_in_nest.data_ptr(), data_in_ring.data_ptr(), d_pix_array, num_elements); - + //cudaDeviceSynchronize(); cudaFree(d_pix_array); @@ -643,170 +643,6 @@ void nest2ring_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_ } -template -void benchmark_nest_ring_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements) { - - int block_size = 1024; - int num_blocks = (num_elements + block_size - 1) / block_size; - - int order = compute_order(nside); - - int* d_pix_array; - int* d_ix_array; - int* d_iy_array; - int* d_face_num_array; - - cudaMalloc(&d_pix_array, num_elements * sizeof(int)); - cudaMalloc(&d_ix_array, num_elements * sizeof(int)); - cudaMalloc(&d_iy_array, num_elements * sizeof(int)); - cudaMalloc(&d_face_num_array, num_elements * sizeof(int)); - - - float total_elapsed_time_r2n = 0.0f; - float total_elapsed_time_n2r = 0.0f; - - float total_elapsed_time_n2xy = 0.0f; - float total_elapsed_time_xy2n = 0.0f; - - float elapsed_time; - - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - cudaStream_t stream; - cudaStreamCreate(&stream); - - int num_runs = 3; - int k = 2; - bool flip = false; - - // warm-up run - for (int i = 0; i < num_runs; ++i) - { - // nest 2 ring - initialize_pix_array<<>>(d_pix_array, num_elements); - nest2xyf_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); - xyf2ring_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); - rearrange_data_kernel_naive <<>>(data_in_nest.data_ptr(), 
data_in_ring.data_ptr(), d_pix_array, num_elements); - - // ring 2 nest - initialize_pix_array<<>>(d_pix_array, num_elements); - ring2xyf_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); - xyf2nest_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); - rearrange_data_kernel_naive <<>>(data_in_ring.data_ptr(), data_in_nest.data_ptr(), d_pix_array, num_elements); - - // nest2xy - initialize_pix_array<<>>(d_pix_array, num_elements); - nest2xyf_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); - xyf2flat_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements, flip, k); - rearrange_data_kernel_naive <<>>(data_in_nest.data_ptr(), data_in_ring.data_ptr(), d_pix_array, num_elements); - - // xy2nest - initialize_pix_array<<>>(d_pix_array, num_elements); - flat2xyf_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements, flip, k); - xyf2nest_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); - rearrange_data_kernel_naive <<>>(data_in_ring.data_ptr(), data_in_nest.data_ptr(), d_pix_array, num_elements); - } - - cudaDeviceSynchronize(); - - num_runs = 10; - - - for (int i = 0; i < num_runs; ++i) - { - - // N2R - cudaEventRecord(start, stream); - - initialize_pix_array<<>>(d_pix_array, num_elements); - nest2xyf_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); - xyf2ring_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); - rearrange_data_kernel_naive <<>>(data_in_nest.data_ptr(), data_in_ring.data_ptr(), d_pix_array, num_elements); - - cudaEventRecord(stop, stream); - cudaEventSynchronize(stop); - - elapsed_time = 0.0f; - cudaEventElapsedTime(&elapsed_time, start, stop); - total_elapsed_time_n2r += elapsed_time; - - // R2N - - cudaEventRecord(start, stream); - - initialize_pix_array<<>>(d_pix_array, num_elements); - ring2xyf_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); - xyf2nest_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); - rearrange_data_kernel_naive <<>>(data_in_ring.data_ptr(), data_in_nest.data_ptr(), d_pix_array, num_elements); - - cudaEventRecord(stop, stream); - cudaEventSynchronize(stop); - - elapsed_time = 0.0f; - cudaEventElapsedTime(&elapsed_time, start, stop); - total_elapsed_time_r2n += elapsed_time; - - // nest2xy - cudaEventRecord(start, stream); - - initialize_pix_array<<>>(d_pix_array, num_elements); - nest2xyf_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); - xyf2flat_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements, flip, k); - rearrange_data_kernel_naive <<>>(data_in_nest.data_ptr(), data_in_ring.data_ptr(), d_pix_array, num_elements); - - cudaEventRecord(stop, stream); - cudaEventSynchronize(stop); - - elapsed_time = 0.0f; - cudaEventElapsedTime(&elapsed_time, start, stop); - total_elapsed_time_n2xy += elapsed_time; - - // xy2nest - cudaEventRecord(start, stream); - - initialize_pix_array<<>>(d_pix_array, num_elements); - flat2xyf_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements, flip, k); - xyf2nest_kernel<<>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements); - 
rearrange_data_kernel_naive <<>>(data_in_ring.data_ptr(), data_in_nest.data_ptr(), d_pix_array, num_elements); - - cudaEventRecord(stop, stream); - cudaEventSynchronize(stop); - - elapsed_time = 0.0f; - cudaEventElapsedTime(&elapsed_time, start, stop); - total_elapsed_time_xy2n += elapsed_time; - - } - - float average_elapsed_time_n2r = total_elapsed_time_n2r / num_runs; - float average_elapsed_time_r2n = total_elapsed_time_r2n / num_runs; - float average_elapsed_time_n2xy = total_elapsed_time_n2xy / num_runs; - float average_elapsed_time_xy2n = total_elapsed_time_xy2n / num_runs; - - - std::cout << "Average elapsed time used by CUDA data ring2nest kernel over " << num_runs << " runs: " << average_elapsed_time_r2n << "ms" << std::endl; - std::cout << "Average elapsed time used by CUDA data nest2ring kernel over " << num_runs << " runs: " << average_elapsed_time_n2r << "ms" << std::endl; - - std::cout << "Average elapsed time used by CUDA data nest2XY kernel over " << num_runs << " runs: " << average_elapsed_time_n2xy << "ms" << std::endl; - std::cout << "Average elapsed time used by CUDA data XY2nest kernel over " << num_runs << " runs: " << average_elapsed_time_xy2n << "ms" << std::endl; - - cudaEventDestroy(start); - cudaEventDestroy(stop); - cudaStreamDestroy(stream); - - cudaDeviceSynchronize(); - - - cudaFree(d_pix_array); - cudaFree(d_ix_array); - cudaFree(d_iy_array); - cudaFree(d_face_num_array); -} - - // Dispatch functions for each wrapper void ring2nest_dispatch(torch::Tensor data_in_ring, torch::Tensor data_in_nest, const int nside, const size_t num_elements) { @@ -825,11 +661,11 @@ void nest2ring_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_ring, void nest2xy_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_xy, const int src_origin, const bool src_clockwise, const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements) { - + AT_DISPATCH_ALL_TYPES(data_in_nest.scalar_type(), "nest2xy", ([&] { nest2xy_kernel_wrapper(data_in_nest, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); })); - + } void xy2nest_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_nest, const int src_origin, const bool src_clockwise, @@ -847,7 +683,7 @@ void ring2xy_dispatch(torch::Tensor data_in_ring, torch::Tensor data_in_xy, cons AT_DISPATCH_ALL_TYPES(data_in_ring.scalar_type(), "ring2xy", ([&] { ring2xy_kernel_wrapper(data_in_ring, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); })); - + } void xy2ring_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_ring, const int src_origin, const bool src_clockwise, @@ -856,7 +692,7 @@ void xy2ring_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_ring, cons AT_DISPATCH_ALL_TYPES(data_in_xy.scalar_type(), "xy2ring", ([&] { xy2ring_kernel_wrapper(data_in_xy, data_in_ring, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); })); - + } void xy2xy_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, const int src_origin, const bool src_clockwise, @@ -865,61 +701,51 @@ void xy2xy_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, const i AT_DISPATCH_ALL_TYPES(data_xy_in.scalar_type(), "xy2xy", ([&] { xy2xy_kernel_wrapper(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); })); - + } -void benchmark_nest_ring_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements) { - 
- AT_DISPATCH_ALL_TYPES(data_in_nest.scalar_type(), "benchmark_nest_ring", ([&] { - benchmark_nest_ring_kernel_wrapper(data_in_nest, data_in_ring, nside, num_elements); - })); - -} - - - template -__global__ void rearrange_data_kernel_3d_batch(const T* d_data_in, T* d_data_out, const I* d_pix_array, +__global__ void rearrange_data_kernel_3d_batch(const T* d_data_in, T* d_data_out, const I* d_pix_array, const size_t m, const size_t n, const size_t num_elements) { - + // d_data_in: m by n by num_elements 3D matrix int idz = blockIdx.z * blockDim.z + threadIdx.z; int idy = blockIdx.y * blockDim.y + threadIdx.y; int idx = blockIdx.x * blockDim.x + threadIdx.x; - + int tid = idz * n * num_elements + idy * num_elements + idx; - + if (idz < m && idy < n && idx < num_elements) { - I pix = d_pix_array[idx]; + I pix = d_pix_array[idx]; d_data_out[tid-idx + pix] = d_data_in[tid]; } } template -__global__ void rearrange_data_kernel_batch(const T* d_data_in, T* d_data_out, const I* d_pix_array, +__global__ void rearrange_data_kernel_batch(const T* d_data_in, T* d_data_out, const I* d_pix_array, const size_t n, const size_t num_elements) { unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; - + unsigned int tid = idy * num_elements + idx; - + if (idy < n && idx < num_elements) { - I pix = d_pix_array[idx]; + I pix = d_pix_array[idx]; d_data_out[tid-idx + pix] = d_data_in[tid]; } } template -void xy2xy_batch_kernel_wrapper(torch::Tensor data_xy_in, torch::Tensor data_xy_out, +void xy2xy_batch_kernel_wrapper(torch::Tensor data_xy_in, torch::Tensor data_xy_out, const int src_origin, const bool src_clockwise, - const int dest_origin, const bool dest_clockwise, + const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements) { // Check tensor dimensions and sizes @@ -965,9 +791,9 @@ void xy2xy_batch_kernel_wrapper(torch::Tensor data_xy_in, torch::Tensor data_xy_ template -void nest2xy_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_xy, +void nest2xy_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_xy, const int src_origin, const bool src_clockwise, - const int dest_origin, const bool dest_clockwise, + const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements) { // Check tensor dimensions and sizes @@ -1022,9 +848,9 @@ void nest2xy_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data } template -void xy2nest_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_in_nest, +void xy2nest_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_in_nest, const int src_origin, const bool src_clockwise, - const int dest_origin, const bool dest_clockwise, + const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements) { // Check tensor dimensions and sizes @@ -1079,9 +905,9 @@ void xy2nest_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_i } template -void ring2xy_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data_in_xy, +void ring2xy_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data_in_xy, const int src_origin, const bool src_clockwise, - const int dest_origin, const bool dest_clockwise, + const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements) { // Check tensor dimensions and sizes @@ -1137,9 +963,9 @@ void ring2xy_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data 
template -void xy2ring_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_in_ring, +void xy2ring_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_in_ring, const int src_origin, const bool src_clockwise, - const int dest_origin, const bool dest_clockwise, + const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements) { // Check tensor dimensions and sizes @@ -1195,7 +1021,7 @@ void xy2ring_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_i template -void ring2nest_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data_in_nest, +void ring2nest_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data_in_nest, const int nside, const size_t num_elements) { // Check tensor dimensions and sizes @@ -1244,7 +1070,7 @@ void ring2nest_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor da } template -void nest2ring_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_ring, +void nest2ring_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements) { // Check tensor dimensions and sizes @@ -1292,372 +1118,6 @@ void nest2ring_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor da cudaFree(d_face_num_array); } -/* -template -void xy2xy_batch_kernel_wrapper(torch::Tensor data_xy_in, torch::Tensor data_xy_out, - const int src_origin, const bool src_clockwise, - const int dest_origin, const bool dest_clockwise, - const int nside, const size_t num_elements) { - - // Check tensor dimensions and sizes - TORCH_CHECK(data_xy_in.dim() == 3, "data_xy_in must be a 3D tensor"); - TORCH_CHECK(data_xy_out.dim() == 3, "data_xy_out must be a 3D tensor"); - - int m = data_xy_in.size(0); // First dimension - int n = data_xy_in.size(1); // Second dimension - - int order = compute_order(nside); - - int block_size = 256; - int num_blocks = (num_elements + block_size - 1)/ block_size; - - // Calculate block and grid dimensions - dim3 block_dim(32, 4, 4); // Example 3D block dimensions - dim3 grid_dim((num_elements + block_dim.x - 1) / block_dim.x, - (n + block_dim.y - 1) / block_dim.y, - (m + block_dim.z - 1) / block_dim.z); - - // Allocate device memory - int* d_pix_array; - - cudaMalloc(&d_pix_array, num_elements * sizeof(int)); - - // Compute rotation parameters - bool flip = src_clockwise != dest_clockwise; - int rotations = dest_origin - src_origin; - int k = (dest_clockwise ? 
-rotations : rotations) & 3;
-    if (k < 0) { k += 4; }
-
-    // Launch kernels
-    initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
-    flat2flat_kernel<<<num_blocks, block_size>>>(d_pix_array, order, nside, num_elements, flip, k);
-
-    // Adjust kernel launch for 3D data
-    rearrange_data_kernel_3d_batch<<<grid_dim, block_dim>>>(data_xy_in.data_ptr<T>(), data_xy_out.data_ptr<T>(), d_pix_array, m, n, num_elements);
-
-    // Free device memory
-    cudaFree(d_pix_array);
-}
-
-
-template <typename T>
-void nest2xy_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_xy,
-                                  const int src_origin, const bool src_clockwise,
-                                  const int dest_origin, const bool dest_clockwise,
-                                  const int nside, const size_t num_elements) {
-
-    // Check tensor dimensions and sizes
-    TORCH_CHECK(data_in_nest.dim() == 3, "data_in_nest should be a 3D tensor");
-    TORCH_CHECK(data_in_xy.dim() == 3, "data_in_xy should be a 3D tensor");
-
-    int m = data_in_nest.size(0); // First dimension
-    int n = data_in_nest.size(1); // Second dimension
-
-    int order = compute_order(nside);
-
-    // Calculate block and grid dimensions
-    dim3 block_dim(32, 4, 4); // Example 3D block dimensions
-    dim3 grid_dim((num_elements + block_dim.x - 1) / block_dim.x,
-                  (n + block_dim.y - 1) / block_dim.y,
-                  (m + block_dim.z - 1) / block_dim.z);
-
-    int block_size = 256;
-    int num_blocks = (num_elements + block_size - 1)/ block_size;
-
-    // Allocate device memory
-    int* d_pix_array;
-    int* d_ix_array;
-    int* d_iy_array;
-    int* d_face_num_array;
-
-    cudaMalloc(&d_pix_array, num_elements * sizeof(int));
-    cudaMalloc(&d_ix_array, num_elements * sizeof(int));
-    cudaMalloc(&d_iy_array, num_elements * sizeof(int));
-    cudaMalloc(&d_face_num_array, num_elements * sizeof(int));
-
-    // Compute rotation parameters
-    bool flip = src_clockwise != dest_clockwise;
-    int rotations = dest_origin - src_origin;
-    int k = (dest_clockwise ? -rotations : rotations) & 3;
-    if (k < 0) { k += 4; }
-
-    // Launch kernels
-    initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
-    nest2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
-    xyf2flat_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements, flip, k);
-
-    // Adjust kernel launch for 3D data
-    rearrange_data_kernel_3d_batch<<<grid_dim, block_dim>>>(data_in_nest.data_ptr<T>(), data_in_xy.data_ptr<T>(), d_pix_array, m, n, num_elements);
-
-    // Free device memory
-    cudaFree(d_pix_array);
-    cudaFree(d_ix_array);
-    cudaFree(d_iy_array);
-    cudaFree(d_face_num_array);
-}
-
-template <typename T>
-void xy2nest_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_in_nest,
-                                  const int src_origin, const bool src_clockwise,
-                                  const int dest_origin, const bool dest_clockwise,
-                                  const int nside, const size_t num_elements) {
-
-    // Check tensor dimensions and sizes
-    TORCH_CHECK(data_in_xy.dim() == 3, "data_in_xy must be a 3D tensor");
-    TORCH_CHECK(data_in_nest.dim() == 3, "data_in_nest must be a 3D tensor");
-
-    int m = data_in_xy.size(0); // First dimension
-    int n = data_in_xy.size(1); // Second dimension
-
-    int order = compute_order(nside);
-
-    int block_size = 256;
-    int num_blocks = (num_elements + block_size - 1)/ block_size;
-
-    // Calculate block and grid dimensions
-    dim3 block_dim(32, 4, 4); // Example 3D block dimensions
-    dim3 grid_dim((num_elements + block_dim.x - 1) / block_dim.x,
-                  (n + block_dim.y - 1) / block_dim.y,
-                  (m + block_dim.z - 1) / block_dim.z);
-
-    // Allocate device memory
-    int* d_pix_array;
-    int* d_ix_array;
-    int* d_iy_array;
-    int* d_face_num_array;
-
-    cudaMalloc(&d_pix_array, num_elements * sizeof(int));
-    cudaMalloc(&d_ix_array, num_elements * sizeof(int));
-    cudaMalloc(&d_iy_array, num_elements * sizeof(int));
-    cudaMalloc(&d_face_num_array, num_elements * sizeof(int));
-
-    // Compute rotation parameters
-    bool flip = src_clockwise != dest_clockwise;
-    int rotations = dest_origin - src_origin;
-    int k = (dest_clockwise ? -rotations : rotations) & 3;
-    if (k < 0) { k += 4; }
-
-    // Launch kernels
-    initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
-    flat2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements, flip, k);
-    xyf2nest_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
-
-    // Adjust kernel launch for 3D data
-    rearrange_data_kernel_3d_batch<<<grid_dim, block_dim>>>(data_in_xy.data_ptr<T>(), data_in_nest.data_ptr<T>(), d_pix_array, m, n, num_elements);
-
-    // Free device memory
-    cudaFree(d_pix_array);
-    cudaFree(d_ix_array);
-    cudaFree(d_iy_array);
-    cudaFree(d_face_num_array);
-}
-
-template <typename T>
-void ring2xy_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data_in_xy,
-                                  const int src_origin, const bool src_clockwise,
-                                  const int dest_origin, const bool dest_clockwise,
-                                  const int nside, const size_t num_elements) {
-
-    // Check tensor dimensions and sizes
-    TORCH_CHECK(data_in_ring.dim() == 3, "data_in_ring must be a 3D tensor");
-    TORCH_CHECK(data_in_xy.dim() == 3, "data_in_xy must be a 3D tensor");
-
-    int m = data_in_ring.size(0); // First dimension
-    int n = data_in_ring.size(1); // Second dimension
-    int order = compute_order(nside);
-
-    int block_size = 256;
-    int num_blocks = (num_elements + block_size - 1)/ block_size;
-
-    // Calculate block and grid dimensions
-    dim3 block_dim(32, 4, 4); // Example 3D block dimensions
-    dim3 grid_dim((num_elements + block_dim.x - 1) / block_dim.x,
-                  (n + block_dim.y - 1) / block_dim.y,
-                  (m + block_dim.z - 1) / block_dim.z);
-
-    // Allocate device memory
-    int* d_pix_array;
-    int* d_ix_array;
-    int* d_iy_array;
-    int* d_face_num_array;
-
-    cudaMalloc(&d_pix_array, num_elements * sizeof(int));
-    cudaMalloc(&d_ix_array, num_elements * sizeof(int));
-    cudaMalloc(&d_iy_array, num_elements * sizeof(int));
-    cudaMalloc(&d_face_num_array, num_elements * sizeof(int));
-
-    // Compute rotation parameters
-    bool flip = src_clockwise != dest_clockwise;
-    int rotations = dest_origin - src_origin;
-    int k = (dest_clockwise ? -rotations : rotations) & 3;
-    if (k < 0) { k += 4; }
-
-    // Launch kernels
-    initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
-    ring2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
-    xyf2flat_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements, flip, k);
-
-    // Adjust kernel launch for 3D data
-    rearrange_data_kernel_3d_batch<<<grid_dim, block_dim>>>(data_in_ring.data_ptr<T>(), data_in_xy.data_ptr<T>(), d_pix_array, m, n, num_elements);
-
-    // Free device memory
-    cudaFree(d_pix_array);
-    cudaFree(d_ix_array);
-    cudaFree(d_iy_array);
-    cudaFree(d_face_num_array);
-}
-
-
-template <typename T>
-void xy2ring_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_in_ring,
-                                  const int src_origin, const bool src_clockwise,
-                                  const int dest_origin, const bool dest_clockwise,
-                                  const int nside, const size_t num_elements) {
-
-    // Check tensor dimensions and sizes
-    TORCH_CHECK(data_in_xy.dim() == 3, "data_in_xy must be a 3D tensor");
-    TORCH_CHECK(data_in_ring.dim() == 3, "data_in_ring must be a 3D tensor");
-
-    int m = data_in_xy.size(0); // First dimension
-    int n = data_in_xy.size(1); // Second dimension
-    int order = compute_order(nside);
-
-    int block_size = 256;
-    int num_blocks = (num_elements + block_size - 1)/ block_size;
-
-    // Calculate block and grid dimensions
-    dim3 block_dim(32, 4, 4); // Example 3D block dimensions
-    dim3 grid_dim((num_elements + block_dim.x - 1) / block_dim.x,
-                  (n + block_dim.y - 1) / block_dim.y,
-                  (m + block_dim.z - 1) / block_dim.z);
-
-    // Allocate device memory
-    int* d_pix_array;
-    int* d_ix_array;
-    int* d_iy_array;
-    int* d_face_num_array;
-
-    cudaMalloc(&d_pix_array, num_elements * sizeof(int));
-    cudaMalloc(&d_ix_array, num_elements * sizeof(int));
-    cudaMalloc(&d_iy_array, num_elements * sizeof(int));
-    cudaMalloc(&d_face_num_array, num_elements * sizeof(int));
-
-    // Compute rotation parameters
-    bool flip = src_clockwise != dest_clockwise;
-    int rotations = dest_origin - src_origin;
-    int k = (dest_clockwise ? -rotations : rotations) & 3;
-    if (k < 0) { k += 4; }
-
-    // Launch kernels
-    initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
-    flat2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements, flip, k);
-    xyf2ring_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
-
-    // Adjust kernel launch for 3D data
-    rearrange_data_kernel_3d_batch<<<grid_dim, block_dim>>>(data_in_xy.data_ptr<T>(), data_in_ring.data_ptr<T>(), d_pix_array, m, n, num_elements);
-
-    // Free device memory
-    cudaFree(d_pix_array);
-    cudaFree(d_ix_array);
-    cudaFree(d_iy_array);
-    cudaFree(d_face_num_array);
-}
-
-
-template <typename T>
-void ring2nest_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data_in_nest,
-                                    const int nside, const size_t num_elements) {
-
-    // Check tensor dimensions and sizes
-    TORCH_CHECK(data_in_ring.dim() == 3, "data_in_ring must be a 3D tensor");
-    TORCH_CHECK(data_in_nest.dim() == 3, "data_in_nest must be a 3D tensor");
-
-    int m = data_in_ring.size(0); // First dimension
-    int n = data_in_ring.size(1); // Second dimension
-    int order = compute_order(nside);
-
-    int block_size = 256;
-    int num_blocks = (num_elements + block_size - 1)/ block_size;
-
-    // Calculate block and grid dimensions
-    dim3 block_dim(32, 4, 4); // Example 3D block dimensions
-    dim3 grid_dim((num_elements + block_dim.x - 1) / block_dim.x,
-                  (n + block_dim.y - 1) / block_dim.y,
-                  (m + block_dim.z - 1) / block_dim.z);
-
-    // Allocate device memory
-    int* d_pix_array;
-    int* d_ix_array;
-    int* d_iy_array;
-    int* d_face_num_array;
-
-    cudaMalloc(&d_pix_array, num_elements * sizeof(int));
-    cudaMalloc(&d_ix_array, num_elements * sizeof(int));
-    cudaMalloc(&d_iy_array, num_elements * sizeof(int));
-    cudaMalloc(&d_face_num_array, num_elements * sizeof(int));
-
-    // Launch kernels
-    initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
-    ring2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
-    xyf2nest_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
-
-    // Adjust kernel launch for 3D data
-    rearrange_data_kernel_3d_batch<<<grid_dim, block_dim>>>(data_in_ring.data_ptr<T>(), data_in_nest.data_ptr<T>(), d_pix_array, m, n, num_elements);
-
-    // Free device memory
-    cudaFree(d_pix_array);
-    cudaFree(d_ix_array);
-    cudaFree(d_iy_array);
-    cudaFree(d_face_num_array);
-}
-
-template <typename T>
-void nest2ring_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_ring,
-                                    const int nside, const size_t num_elements) {
-
-    // Check tensor dimensions and sizes
-    TORCH_CHECK(data_in_nest.dim() == 3, "data_in_nest must be a 3D tensor");
-    TORCH_CHECK(data_in_ring.dim() == 3, "data_in_ring must be a 3D tensor");
-
-    int m = data_in_nest.size(0); // First dimension
-    int n = data_in_nest.size(1); // Second dimension
-    int order = compute_order(nside);
-
-    int block_size = 256;
-    int num_blocks = (num_elements + block_size - 1)/ block_size;
-
-    // Calculate block and grid dimensions
-    dim3 block_dim(32, 4, 4); // Example 3D block dimensions
-    dim3 grid_dim((num_elements + block_dim.x - 1) / block_dim.x,
-                  (n + block_dim.y - 1) / block_dim.y,
-                  (m + block_dim.z - 1) / block_dim.z);
-
-    // Allocate device memory
-    int* d_pix_array;
-    int* d_ix_array;
-    int* d_iy_array;
-    int* d_face_num_array;
-
-    cudaMalloc(&d_pix_array, num_elements * sizeof(int));
-    cudaMalloc(&d_ix_array, num_elements * sizeof(int));
-    cudaMalloc(&d_iy_array, num_elements * sizeof(int));
-    cudaMalloc(&d_face_num_array, num_elements * sizeof(int));
-
-    // Launch kernels
-    initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
-    nest2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
-    xyf2ring_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
-
-    // Adjust kernel launch for 3D data
-    rearrange_data_kernel_3d_batch<<<grid_dim, block_dim>>>(data_in_nest.data_ptr<T>(), data_in_ring.data_ptr<T>(), d_pix_array, m, n, num_elements);
-
-    // Free device memory
-    cudaFree(d_pix_array);
-    cudaFree(d_ix_array);
-    cudaFree(d_iy_array);
-    cudaFree(d_face_num_array);
-}
-*/
 // Dispatch functions for each wrapper
 void ring2nest_batch_dispatch(torch::Tensor data_in_ring, torch::Tensor data_in_nest, const int nside, const size_t num_elements) {
@@ -1677,11 +1137,11 @@ void nest2ring_batch_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_
 void nest2xy_batch_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_xy, const int src_origin, const bool src_clockwise,
                    const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements) {
-    
+
     AT_DISPATCH_ALL_TYPES(data_in_nest.scalar_type(), "nest2xy_batch", ([&] {
         nest2xy_batch_kernel_wrapper<scalar_t>(data_in_nest, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
     }));
-    
+
 }
 void xy2nest_batch_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_nest, const int src_origin, const bool src_clockwise,
@@ -1699,7 +1159,7 @@ void ring2xy_batch_dispatch(torch::Tensor data_in_ring, torch::Tensor data_in_xy
     AT_DISPATCH_ALL_TYPES(data_in_ring.scalar_type(), "ring2xy_batch", ([&] {
         ring2xy_batch_kernel_wrapper<scalar_t>(data_in_ring, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
     }));
-    
+
 }
 void xy2ring_batch_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_ring, const int src_origin, const bool src_clockwise,
@@ -1708,7 +1168,7 @@ void xy2ring_batch_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_ring
     AT_DISPATCH_ALL_TYPES(data_in_xy.scalar_type(), "xy2ring_batch", ([&] {
         xy2ring_batch_kernel_wrapper<scalar_t>(data_in_xy, data_in_ring, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
     }));
-    
+
 }
 void xy2xy_batch_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, const int src_origin, const bool src_clockwise,
@@ -1717,5 +1177,6 @@ void xy2xy_batch_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, c
     AT_DISPATCH_ALL_TYPES(data_xy_in.scalar_type(), "xy2xy_batch", ([&] {
         xy2xy_batch_kernel_wrapper<scalar_t>(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
     }));
-    
+
 }
+
diff --git a/src/harmonic_transform/hpx_fft.cpp b/src/harmonic_transform/hpx_fft.cpp
index 90c8af1..8531caf 100644
--- a/src/harmonic_transform/hpx_fft.cpp
+++ b/src/harmonic_transform/hpx_fft.cpp
@@ -421,11 +421,6 @@ torch::Tensor healpix_irfft_cuda(torch::Tensor ftm, int L, int nside) {
 }
 
-// PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-//     m.def("healpix_rfft_cuda", &healpix_rfft_cuda, "HEALPix RFFT with cuFFT and CUDA");
-//     m.def("healpix_irfft_cuda", &healpix_irfft_cuda, "HEALPix IRFFT with cuFFT and CUDA");
-// }
-
 int nphi_ring(int t, int nside);
 int cumulative_nphi_ring(int t, int nside);
 double p2phi_ring(int t, int offset, int nside);

From beb7519ea4f1d34728d08a621e083807c099c04d Mon Sep 17 00:00:00 2001
From: xiaopoc
Date: Mon, 2 Jun 2025 11:52:17 -0700
Subject: [PATCH 3/6] Clean up

---
 cuhpx/hpx_remap.py |  3 +--
 cuhpx/hpx_sht.py   | 31 +------------------------------
 2 files 
changed, 2 insertions(+), 32 deletions(-) diff --git a/cuhpx/hpx_remap.py b/cuhpx/hpx_remap.py index a1adb05..0bd6612 100644 --- a/cuhpx/hpx_remap.py +++ b/cuhpx/hpx_remap.py @@ -13,9 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import cuhpx_remap import torch - +from . import cuhpx_remap def is_power_of_two(n): return (n > 0) and (n & (n - 1)) == 0 diff --git a/cuhpx/hpx_sht.py b/cuhpx/hpx_sht.py index bb77bb2..eec5ce1 100644 --- a/cuhpx/hpx_sht.py +++ b/cuhpx/hpx_sht.py @@ -13,10 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import cuhpx_fft +from . import cuhpx_fft import numpy as np import torch -import torch.cuda.nvtx import torch.nn as nn from torch.autograd import Function @@ -449,13 +448,10 @@ def einsum_with_chunking(x, weights, mmax, xout, nchunk, stream1): device = torch.device("cuda") chunk_size = int(weights.size(1) / nchunk + 1) # Adjust this based on your memory constraints - torch.cuda.nvtx.range_push("Allocate memory for chunk") next_chunk_cpu = torch.empty((weights.size(0), chunk_size, weights.size(2)), dtype=weights.dtype, pin_memory=True) current_chunk = torch.empty((weights.size(0), chunk_size, weights.size(2)), dtype=weights.dtype, device=device) next_chunk = torch.empty_like(current_chunk) - torch.cuda.nvtx.range_pop() - torch.cuda.nvtx.range_push("einsum between x and weights with chunking") # Create events for synchronization event_transfer = torch.cuda.Event(blocking=True) @@ -475,40 +471,30 @@ def einsum_with_chunking(x, weights, mmax, xout, nchunk, stream1): if actual_chunk_size != chunk_size: next_chunk_cpu.resize_((weights.size(0), actual_chunk_size, weights.size(2))) - torch.cuda.nvtx.range_push("CPU copy from weights to pin memory") next_chunk_cpu.copy_(weights[:, start_i:end_i, :]) - torch.cuda.nvtx.range_pop() with torch.cuda.stream(stream1): - torch.cuda.nvtx.range_push(f"Transfer weights chunk {i}:{end_i} to GPU") next_chunk[: weights.size(0), : end_i - start_i, :].copy_(next_chunk_cpu, non_blocking=True) event_transfer.record(stream1) - torch.cuda.nvtx.range_pop() - torch.cuda.nvtx.range_push(f"Compute einsum for chunk {i - chunk_size}:{end_i - chunk_size}") xout[..., start_j:end_j, :, :] = torch.einsum( '...kmn,mlk->...lmn', x, current_chunk[:, : end_j - start_j, :].to(x.dtype) ) event_computation.record(torch.cuda.current_stream()) - torch.cuda.nvtx.range_pop() torch.cuda.current_stream().wait_event(event_transfer) current_chunk, next_chunk = next_chunk, current_chunk start_j, end_j = start_i, end_i if start_i < weights.size(1): - torch.cuda.nvtx.range_push("Compute einsum for the last chunk") xout[..., start_i:end_i, :, :] = torch.einsum( '...kmn,mlk->...lmn', x, current_chunk[:, : end_i - start_i, :].to(x.dtype) ) - torch.cuda.nvtx.range_pop() stream1.synchronize() torch.cuda.current_stream().synchronize() - torch.cuda.nvtx.range_pop() # End of einsum with chunking - return xout @@ -523,39 +509,28 @@ def forward(ctx, x, weights, pct, W, mmax, lmax, nside): ctx.lmax = lmax ctx.nside = nside - torch.cuda.nvtx.range_push("rfft") # SHT if x.dim() == 1: x = cuhpx_fft.healpix_rfft_class(x, mmax, nside) else: x = cuhpx_fft.healpix_rfft_batch(x, mmax, nside) - torch.cuda.nvtx.range_pop() - x = torch.view_as_real(x) out_shape = list(x.size()) out_shape[-3] = lmax out_shape[-2] = mmax - torch.cuda.nvtx.range_push("allocate xout") xout = torch.zeros(out_shape, dtype=x.dtype, device=x.device) - torch.cuda.nvtx.range_pop() - 
torch.cuda.nvtx.range_push("einsum between pct and weights") weights = pct * weights - torch.cuda.nvtx.range_pop() if not pct.is_cuda: - torch.cuda.nvtx.range_push("einsum between x and weights using two stream") nchunk = 12 stream1 = torch.cuda.Stream() xout = einsum_with_chunking(x, weights, mmax, xout, nchunk, stream1) - torch.cuda.nvtx.range_pop() else: - torch.cuda.nvtx.range_push("einsum between x and weights") xout = torch.einsum('...kmn,mlk->...lmn', x, weights.to(x.dtype)) - torch.cuda.nvtx.range_pop() x = torch.view_as_complex(xout.contiguous()) @@ -595,18 +570,14 @@ def forward(ctx, x, weights, pct, W, mmax, lmax, nside): x = torch.view_as_real(x) - torch.cuda.nvtx.range_push("einsum between x and pct") xs = torch.einsum('...lmn, mlk->...kmn', x, pct.to(x.dtype)) - torch.cuda.nvtx.range_pop() x = torch.view_as_complex(xs.contiguous()) - torch.cuda.nvtx.range_push("irfft") if x.dim() == 2: x = cuhpx_fft.healpix_irfft_class(x, mmax, nside) else: x = cuhpx_fft.healpix_irfft_batch(x, mmax, nside) - torch.cuda.nvtx.range_pop() return x From 5680410c77675703a8f50894873b0cce40fc387f Mon Sep 17 00:00:00 2001 From: root Date: Tue, 5 Aug 2025 01:16:40 +0000 Subject: [PATCH 4/6] fix bug for build tools --- src/harmonic_transform/hpx_fft.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/harmonic_transform/hpx_fft.cpp b/src/harmonic_transform/hpx_fft.cpp index 8531caf..ddacfd2 100644 --- a/src/harmonic_transform/hpx_fft.cpp +++ b/src/harmonic_transform/hpx_fft.cpp @@ -15,6 +15,7 @@ * limitations under the License. */ +#include #include "hpx_fft.h" #include @@ -666,7 +667,7 @@ torch::Tensor healpix_irfft(torch::Tensor ftm, int L, int nside) { return f; } -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { +PYBIND11_MODULE(cuhpx_fft, m) { m.def("healpix_rfft", &healpix_rfft, "HEALPix RFFT"); m.def("healpix_irfft", &healpix_irfft, "HEALPix IRFFT"); m.def("healpix_rfft_cufft", &healpix_rfft_cufft, "HEALPix RFFT with cuFFT"); From acb92bc2d9ca77c5dfb47f820988b4fc458258ca Mon Sep 17 00:00:00 2001 From: xiaopoc Date: Mon, 11 Aug 2025 16:47:18 -0700 Subject: [PATCH 5/6] add CMakeLists.txt --- CMakeLists.txt | 111 ++- cuhpx/hpx_remap.py | 3 +- cuhpx/hpx_sht.py | 2 + pyproject.toml | 39 +- setup.py | 55 -- src/data_remapping/hpx_remapping.cpp | 946 ++++++++++++----------- src/data_remapping/hpx_remapping.h | 4 +- src/data_remapping/hpx_remapping_cuda.cu | 627 +++++++++++++-- src/harmonic_transform/hpx_fft.cpp | 6 +- 9 files changed, 1174 insertions(+), 619 deletions(-) delete mode 100644 setup.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 99a9700..55d0226 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,69 +1,106 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR) project(cuhpx LANGUAGES CXX CUDA) -# C++ and CUDA standard +# ---------- Basics ---------- set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +# LTO/IPO can drop CUDA fatbins; keep it off for these extensions +set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF) -# Find Python and determine site-packages path +# CUDA archs default; override via -DCMAKE_CUDA_ARCHITECTURES=80;86;90 +if(NOT CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "OFF") + set(CMAKE_CUDA_ARCHITECTURES 90) # Hopper default +endif() +message(STATUS "✅ cuhpx build configured. 
CUDA archs: ${CMAKE_CUDA_ARCHITECTURES}")
+
+# ---------- Python / Torch ----------
 find_package(Python REQUIRED COMPONENTS Interpreter Development.Module)
+
+# Find site-packages so we can locate Torch's CMake configs
 execute_process(
-    COMMAND "${Python_EXECUTABLE}" -c "import sysconfig; print(sysconfig.get_paths()['purelib'])"
-    OUTPUT_VARIABLE PYTHON_SITE_PACKAGES
-    OUTPUT_STRIP_TRAILING_WHITESPACE
+  COMMAND "${Python_EXECUTABLE}" -c "import sysconfig; print(sysconfig.get_paths()['purelib'])"
+  OUTPUT_VARIABLE PYTHON_SITE_PACKAGES
+  OUTPUT_STRIP_TRAILING_WHITESPACE
 )
-
-# Add PyTorch CMake path
 list(APPEND CMAKE_PREFIX_PATH "${PYTHON_SITE_PACKAGES}/torch/share/cmake")
 
-# Find required packages
-find_package(Torch REQUIRED)
+find_package(Torch REQUIRED)    # ${TORCH_LIBRARIES}, ${TORCH_CXX_FLAGS}
 find_package(pybind11 REQUIRED)
-find_package(CUDA REQUIRED)
 
-# Optional: Locate libtorch_python.so dynamically
-set(TORCH_PYTHON_LIB "${PYTHON_SITE_PACKAGES}/torch/lib/libtorch_python.so")
+# ---------- CUDA toolchain (modern imported targets) ----------
+find_package(CUDAToolkit REQUIRED)   # provides CUDA::cudart, etc.
 
-# Set CUDA architecture (change if needed)
-set(CMAKE_CUDA_ARCHITECTURES 90)
-list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr --expt-extended-lambda -O3)
+# Optional: Torch's Python shim (resolves at::Tensor pybind casters)
+find_library(TORCH_PYTHON_LIBRARY
+  NAMES torch_python
+  HINTS "${PYTHON_SITE_PACKAGES}/torch/lib" "${TORCH_INSTALL_PREFIX}/lib"
+)
+if(TORCH_PYTHON_LIBRARY)
+  message(STATUS "Found torch_python at: ${TORCH_PYTHON_LIBRARY}")
+endif()
 
-# Include paths
 include_directories(
-    ${TORCH_INCLUDE_DIRS}
-    ${CUDA_INCLUDE_DIRS}
-    ${CMAKE_CURRENT_SOURCE_DIR}/src
+  ${TORCH_INCLUDE_DIRS}
+  ${CMAKE_CURRENT_SOURCE_DIR}/src
 )
 
+# ---------- Helper: apply safe CUDA flags per target ----------
+function(cuhpx_apply_cuda_flags target_name)
+  target_compile_options(${target_name} PRIVATE
+    $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
+    $<$<COMPILE_LANGUAGE:CUDA>:--expt-extended-lambda>
+    $<$<COMPILE_LANGUAGE:CUDA>:-Xcudafe>
+    $<$<COMPILE_LANGUAGE:CUDA>:--diag_suppress=20014>
+  )
+  set_target_properties(${target_name} PROPERTIES
+    CUDA_SEPARABLE_COMPILATION ON
+    INTERPROCEDURAL_OPTIMIZATION FALSE
+  )
+endfunction()
+
 # ==================== cuhpx_fft ====================
 set(CUHPX_FFT_SRC
-    src/harmonic_transform/hpx_fft.cpp
-    src/harmonic_transform/hpx_fft_cuda.cu
+  src/harmonic_transform/hpx_fft.cpp
+  src/harmonic_transform/hpx_fft_cuda.cu
 )
-
 pybind11_add_module(cuhpx_fft MODULE ${CUHPX_FFT_SRC})
-target_link_libraries(cuhpx_fft PRIVATE ${TORCH_LIBRARIES} ${CUDA_LIBRARIES})
-target_link_libraries(cuhpx_fft PRIVATE ${TORCH_PYTHON_LIB})
 target_compile_definitions(cuhpx_fft PRIVATE ${TORCH_CXX_FLAGS})
-set_target_properties(cuhpx_fft PROPERTIES PREFIX "" LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cuhpx)
+target_link_libraries(cuhpx_fft PRIVATE
+  ${TORCH_LIBRARIES}
+  CUDA::cudart
+  Python::Module
+)
+if(TORCH_PYTHON_LIBRARY)
+  target_link_libraries(cuhpx_fft PRIVATE ${TORCH_PYTHON_LIBRARY})
+endif()
+set_target_properties(cuhpx_fft PROPERTIES PREFIX "" OUTPUT_NAME "cuhpx_fft")
+cuhpx_apply_cuda_flags(cuhpx_fft)
 
 # ==================== cuhpx_remap ====================
 set(CUHPX_REMAP_SRC
-    src/data_remapping/hpx_remapping.cpp
-    src/data_remapping/hpx_remapping_cuda.cu
+  src/data_remapping/hpx_remapping.cpp
+  src/data_remapping/hpx_remapping_cuda.cu
 )
-
 pybind11_add_module(cuhpx_remap MODULE ${CUHPX_REMAP_SRC})
-target_link_libraries(cuhpx_remap PRIVATE ${TORCH_LIBRARIES} ${CUDA_LIBRARIES})
-target_link_libraries(cuhpx_remap PRIVATE ${TORCH_PYTHON_LIB})
target_compile_definitions(cuhpx_remap PRIVATE ${TORCH_CXX_FLAGS}) -set_target_properties(cuhpx_remap PROPERTIES PREFIX "" LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cuhpx) - -# ==================== Install ==================== -install(TARGETS cuhpx_fft cuhpx_remap DESTINATION cuhpx) -install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cuhpx/data DESTINATION cuhpx FILES_MATCHING PATTERN "*.fits") - -message(STATUS "✅ cuhpx build system configured successfully.") +target_link_libraries(cuhpx_remap PRIVATE + ${TORCH_LIBRARIES} + CUDA::cudart + Python::Module +) +if(TORCH_PYTHON_LIBRARY) + target_link_libraries(cuhpx_remap PRIVATE ${TORCH_PYTHON_LIBRARY}) +endif() +set_target_properties(cuhpx_remap PROPERTIES PREFIX "" OUTPUT_NAME "cuhpx_remap") +cuhpx_apply_cuda_flags(cuhpx_remap) +# ---------- Install layout for wheels ---------- +# scikit-build-core will install these into site-packages/cuhpx/ for wheels. +install(TARGETS cuhpx_fft cuhpx_remap LIBRARY DESTINATION cuhpx) +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cuhpx/data + DESTINATION cuhpx + FILES_MATCHING PATTERN "*.fits" +) diff --git a/cuhpx/hpx_remap.py b/cuhpx/hpx_remap.py index 0bd6612..4090424 100644 --- a/cuhpx/hpx_remap.py +++ b/cuhpx/hpx_remap.py @@ -13,8 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import torch from . import cuhpx_remap +import torch + def is_power_of_two(n): return (n > 0) and (n & (n - 1)) == 0 diff --git a/cuhpx/hpx_sht.py b/cuhpx/hpx_sht.py index eec5ce1..89629d8 100644 --- a/cuhpx/hpx_sht.py +++ b/cuhpx/hpx_sht.py @@ -495,6 +495,7 @@ def einsum_with_chunking(x, weights, mmax, xout, nchunk, stream1): stream1.synchronize() torch.cuda.current_stream().synchronize() + return xout @@ -515,6 +516,7 @@ def forward(ctx, x, weights, pct, W, mmax, lmax, nside): else: x = cuhpx_fft.healpix_rfft_batch(x, mmax, nside) + x = torch.view_as_real(x) out_shape = list(x.size()) diff --git a/pyproject.toml b/pyproject.toml index e4454d1..326b0fa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,21 +1,24 @@ [build-system] -requires = ["torch", "setuptools"] -build-backend = "setuptools.build_meta" +requires = ["scikit-build-core>=0.8", "pybind11>=2.12", "numpy"] +build-backend = "scikit_build_core.build" [project] name = "cuHPX" -version = "2025.5.1" -description = "GPU-accelerated utilities for data on HEALPix grids." -readme = "README.md" -license = { file="LICENSE.txt" } +version = "2025.8.1" +description = "CUDA-accelerated HEALPix tools for harmonic transforms and remapping" authors = [ - { name = "NVIDIA", email = "asubramaniam@nvidia.com" } + { name = "Xiaopo Cheng", email = "xiaopoc@nvidia.com" }, + { name = "Akshay Subramaniam", email = "asubramaniam@nvidia.com" } ] +readme = "README.md" +license = { file = "LICENSE.txt" } +requires-python = ">=3.8" dependencies = [ - "numpy", - "torch>=2.0.0", - "astropy", - "torch_harmonics", + "numpy", + "astropy", + "torch_harmonics" + # Expect PyTorch preinstalled in the environment; if you want to enforce: + # "torch>=2.4", ] classifiers = [ "Development Status :: 2 - Pre-Alpha", @@ -32,6 +35,18 @@ classifiers = [ [project.urls] "Homepage" = "https://github.com/NVlabs/cuHPX" +[tool.scikit-build] +wheel.packages = ["cuhpx"] +cmake.minimum-version = "3.18" +cmake.source-dir = "." 
+# Keep build artifacts in build/, not alongside sources +build-dir = "build/{wheel_tag}" +sdist.include = ["src", "cuhpx", "tests", "CMakeLists.txt", "LICENSE.txt", "README.md"] + +[tool.scikit-build.editable] +# Key: makes editable installs load extensions from build/ via a redirect hook +mode = "redirect" +verbose = true [tool.black] line-length = 120 @@ -115,4 +130,4 @@ target-version = 'py38' "S101", # asserts allowed in tests... "ARG", # Unused function args -> fixtures nevertheless are functionally relevant... "FBT", # Don't care about booleans as positional arguments in tests, e.g. via @pytest.mark.parametrize() -] +] \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index 4465e59..0000000 --- a/setup.py +++ /dev/null @@ -1,55 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from setuptools import find_packages, setup -from torch.utils.cpp_extension import BuildExtension, CUDAExtension - -setup( - name='cuhpx', - version='0.1.0', - packages=find_packages(), - package_data={'cuhpx': ['**/*.fits']}, - url='https://gitlab-master.nvidia.com/Devtech-Compute/cuhpx', - license='TBD', - author='Xiaopo Cheng, Akshay Subramaniam', - author_email='xiaopoc@nvidia.com, asubramaniam@nvidia.com', - description='A library for performing transformations and analysis on HEALPix', - install_requires=[ - 'numpy', - 'torch', - 'astropy', - 'torch_harmonics', - ], - ext_modules=[ - CUDAExtension( - 'cuhpx_remap', - sources=[ - 'src/data_remapping/hpx_remapping.cpp', - 'src/data_remapping/hpx_remapping_cuda.cu', - ], - extra_compile_args={'nvcc': ['-O2']}, - ), - CUDAExtension( - 'cuhpx_fft', - sources=[ - 'src/harmonic_transform/hpx_fft.cpp', - 'src/harmonic_transform/hpx_fft_cuda.cu', - ], - extra_compile_args={'nvcc': ['-O2', '-lineinfo']}, - ), - ], - cmdclass={'build_ext': BuildExtension}, -) diff --git a/src/data_remapping/hpx_remapping.cpp b/src/data_remapping/hpx_remapping.cpp index aaff451..632d1e1 100644 --- a/src/data_remapping/hpx_remapping.cpp +++ b/src/data_remapping/hpx_remapping.cpp @@ -1,468 +1,478 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include "hpx_remapping.h" - -torch::Tensor ring2nest(torch::Tensor data_in_ring, const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_in_nest = torch::empty_like(data_in_ring); - - ring2nest_dispatch(data_in_ring, data_in_nest, nside, num_elements); - - return data_in_nest; -} - - -torch::Tensor nest2ring(torch::Tensor data_in_nest, const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_in_ring = torch::empty_like(data_in_nest); - - nest2ring_dispatch(data_in_nest, data_in_ring, nside, num_elements); - - return data_in_ring; -} - -torch::Tensor nest2xy(torch::Tensor data_in_nest, const std::string& origin, const bool clockwise, - const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_in_xy = torch::empty_like(data_in_nest); - - int src_origin = 0; - int dest_origin; - - bool src_clockwise = false; - bool dest_clockwise = clockwise; - - if (origin == "S"){ - dest_origin = 0; - } else if (origin == "E"){ - dest_origin = 1; - } else if (origin == "N"){ - dest_origin = 2; - } else if (origin == "W"){ - dest_origin = 3; - } else { - TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); - } - - - nest2xy_dispatch(data_in_nest, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); - - return data_in_xy; -} - -torch::Tensor xy2nest(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise, - const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_in_nest = torch::empty_like(data_in_xy); - - - int src_origin; - int dest_origin = 0; - - bool src_clockwise = clockwise; - bool dest_clockwise = false; - - if (origin == "S"){ - src_origin = 0; - } else if (origin == "E"){ - src_origin = 1; - } else if (origin == "N"){ - src_origin = 2; - } else if (origin == "W"){ - src_origin = 3; - } else { - TORCH_CHECK(false, "The origin must be one from S, E, N, W. 
Stop"); - } - - - xy2nest_dispatch(data_in_xy, data_in_nest, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); - - return data_in_nest; -} - -torch::Tensor ring2xy(torch::Tensor data_in_ring, const std::string& origin, const bool clockwise, - const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_in_xy = torch::empty_like(data_in_ring); - - int src_origin = 0; - int dest_origin; - - bool src_clockwise = false; - bool dest_clockwise = clockwise; - - if (origin == "S"){ - dest_origin = 0; - } else if (origin == "E"){ - dest_origin = 1; - } else if (origin == "N"){ - dest_origin = 2; - } else if (origin == "W"){ - dest_origin = 3; - } else { - TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); - } - - - ring2xy_dispatch(data_in_ring, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); - - return data_in_xy; -} - -torch::Tensor xy2ring(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise, - const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_in_ring = torch::empty_like(data_in_xy); - - int src_origin; - int dest_origin = 0; - - bool src_clockwise = clockwise; - bool dest_clockwise = false; - - if (origin == "S"){ - src_origin = 0; - } else if (origin == "E"){ - src_origin = 1; - } else if (origin == "N"){ - src_origin = 2; - } else if (origin == "W"){ - src_origin = 3; - } else { - TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); - } - - - xy2ring_dispatch(data_in_xy, data_in_ring, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); - - return data_in_ring; -} - -torch::Tensor xy2xy(torch::Tensor data_xy_in, const std::string& s_origin, const bool src_clockwise, const std::string& d_origin, const bool dest_clockwise, - const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_xy_out = torch::empty_like(data_xy_in); - - int src_origin; - int dest_origin; - - - if (s_origin == "S"){ - src_origin = 0; - } else if (s_origin == "E"){ - src_origin = 1; - } else if (s_origin == "N"){ - src_origin = 2; - } else if (s_origin == "W"){ - src_origin = 3; - } else { - TORCH_CHECK(false, "The src origin must be one from S, E, N, W. Stop"); - } - - if (d_origin == "S"){ - dest_origin = 0; - } else if (d_origin == "E"){ - dest_origin = 1; - } else if (d_origin == "N"){ - dest_origin = 2; - } else if (d_origin == "W"){ - dest_origin = 3; - } else { - TORCH_CHECK(false, "The dest origin must be one from S, E, N, W. 
Stop"); - } - - - xy2xy_dispatch(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); - - return data_xy_out; -} - - -torch::Tensor ring2nest_batch(torch::Tensor data_in_ring, const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_in_nest = torch::empty_like(data_in_ring); - - ring2nest_batch_dispatch(data_in_ring, data_in_nest, nside, num_elements); - - return data_in_nest; -} - - -torch::Tensor nest2ring_batch(torch::Tensor data_in_nest, const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_in_ring = torch::empty_like(data_in_nest); - - nest2ring_batch_dispatch(data_in_nest, data_in_ring, nside, num_elements); - - return data_in_ring; -} - -torch::Tensor nest2xy_batch(torch::Tensor data_in_nest, const std::string& origin, const bool clockwise, - const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_in_xy = torch::empty_like(data_in_nest); - - int src_origin = 0; - int dest_origin; - - bool src_clockwise = false; - bool dest_clockwise = clockwise; - - if (origin == "S"){ - dest_origin = 0; - } else if (origin == "E"){ - dest_origin = 1; - } else if (origin == "N"){ - dest_origin = 2; - } else if (origin == "W"){ - dest_origin = 3; - } else { - TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); - } - - - nest2xy_batch_dispatch(data_in_nest, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); - - return data_in_xy; -} - -torch::Tensor xy2nest_batch(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise, - const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_in_nest = torch::empty_like(data_in_xy); - - - int src_origin; - int dest_origin = 0; - - bool src_clockwise = clockwise; - bool dest_clockwise = false; - - if (origin == "S"){ - src_origin = 0; - } else if (origin == "E"){ - src_origin = 1; - } else if (origin == "N"){ - src_origin = 2; - } else if (origin == "W"){ - src_origin = 3; - } else { - TORCH_CHECK(false, "The origin must be one from S, E, N, W. 
Stop"); - } - - - xy2nest_batch_dispatch(data_in_xy, data_in_nest, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); - - return data_in_nest; -} - -torch::Tensor ring2xy_batch(torch::Tensor data_in_ring, const std::string& origin, const bool clockwise, - const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_in_xy = torch::empty_like(data_in_ring); - - int src_origin = 0; - int dest_origin; - - bool src_clockwise = false; - bool dest_clockwise = clockwise; - - if (origin == "S"){ - dest_origin = 0; - } else if (origin == "E"){ - dest_origin = 1; - } else if (origin == "N"){ - dest_origin = 2; - } else if (origin == "W"){ - dest_origin = 3; - } else { - TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); - } - - - ring2xy_batch_dispatch(data_in_ring, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); - - return data_in_xy; -} - -torch::Tensor xy2ring_batch(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise, - const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_in_ring = torch::empty_like(data_in_xy); - - int src_origin; - int dest_origin = 0; - - bool src_clockwise = clockwise; - bool dest_clockwise = false; - - if (origin == "S"){ - src_origin = 0; - } else if (origin == "E"){ - src_origin = 1; - } else if (origin == "N"){ - src_origin = 2; - } else if (origin == "W"){ - src_origin = 3; - } else { - TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); - } - - - xy2ring_batch_dispatch(data_in_xy, data_in_ring, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); - - return data_in_ring; -} - -torch::Tensor xy2xy_batch(torch::Tensor data_xy_in, const std::string& s_origin, const bool src_clockwise, const std::string& d_origin, const bool dest_clockwise, - const int nside, const size_t num_elements) { - - const size_t expected_num_elements = static_cast(nside)*nside*12; - - // Check if num_elements matches the expected number of elements - TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); - - auto data_xy_out = torch::empty_like(data_xy_in); - - int src_origin; - int dest_origin; - - - if (s_origin == "S"){ - src_origin = 0; - } else if (s_origin == "E"){ - src_origin = 1; - } else if (s_origin == "N"){ - src_origin = 2; - } else if (s_origin == "W"){ - src_origin = 3; - } else { - TORCH_CHECK(false, "The src origin must be one from S, E, N, W. Stop"); - } - - if (d_origin == "S"){ - dest_origin = 0; - } else if (d_origin == "E"){ - dest_origin = 1; - } else if (d_origin == "N"){ - dest_origin = 2; - } else if (d_origin == "W"){ - dest_origin = 3; - } else { - TORCH_CHECK(false, "The dest origin must be one from S, E, N, W. 
Stop"); - } - - - xy2xy_batch_dispatch(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); - - return data_xy_out; -} - - - -PYBIND11_MODULE(cuhpx_remap, m) { - - m.def("ring2nest", &ring2nest, "Convert ring to nest (CUDA)"); - m.def("nest2ring", &nest2ring, "Convert nest to ring (CUDA)"); - - m.def("nest2xy", &nest2xy, "Convert nest to xy (CUDA)"); - m.def("xy2nest", &xy2nest, "Convert xy to nest (CUDA)"); - - m.def("ring2xy", &ring2xy, "Convert ring to xy (CUDA)"); - m.def("xy2ring", &xy2ring, "Convert xy to ring (CUDA)"); - - m.def("xy2xy", &xy2xy, "Convert xy to xy (CUDA)"); - - m.def("ring2nest_batch", &ring2nest_batch, "Convert ring to nest (CUDA) in batch"); - m.def("nest2ring_batch", &nest2ring_batch, "Convert nest to ring (CUDA) in batch"); - - m.def("nest2xy_batch", &nest2xy_batch, "Convert nest to xy (CUDA) in batch"); - m.def("xy2nest_batch", &xy2nest_batch, "Convert xy to nest (CUDA) in batch"); - - m.def("ring2xy_batch", &ring2xy_batch, "Convert ring to xy (CUDA) in batch"); - m.def("xy2ring_batch", &xy2ring_batch, "Convert xy to ring (CUDA) in batch"); - - m.def("xy2xy_batch", &xy2xy_batch, "Convert xy to xy (CUDA) in batch"); - -} - +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "hpx_remapping.h" + +torch::Tensor ring2nest(torch::Tensor data_in_ring, const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); + + auto data_in_nest = torch::empty_like(data_in_ring); + + ring2nest_dispatch(data_in_ring, data_in_nest, nside, num_elements); + + return data_in_nest; +} + + +torch::Tensor nest2ring(torch::Tensor data_in_nest, const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); + + auto data_in_ring = torch::empty_like(data_in_nest); + + nest2ring_dispatch(data_in_nest, data_in_ring, nside, num_elements); + + return data_in_ring; +} + +torch::Tensor nest2xy(torch::Tensor data_in_nest, const std::string& origin, const bool clockwise, + const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. 
Stop."); + + auto data_in_xy = torch::empty_like(data_in_nest); + + int src_origin = 0; + int dest_origin; + + bool src_clockwise = false; + bool dest_clockwise = clockwise; + + if (origin == "S"){ + dest_origin = 0; + } else if (origin == "E"){ + dest_origin = 1; + } else if (origin == "N"){ + dest_origin = 2; + } else if (origin == "W"){ + dest_origin = 3; + } else { + TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); + } + + + nest2xy_dispatch(data_in_nest, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); + + return data_in_xy; +} + +torch::Tensor xy2nest(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise, + const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); + + auto data_in_nest = torch::empty_like(data_in_xy); + + + int src_origin; + int dest_origin = 0; + + bool src_clockwise = clockwise; + bool dest_clockwise = false; + + if (origin == "S"){ + src_origin = 0; + } else if (origin == "E"){ + src_origin = 1; + } else if (origin == "N"){ + src_origin = 2; + } else if (origin == "W"){ + src_origin = 3; + } else { + TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); + } + + + xy2nest_dispatch(data_in_xy, data_in_nest, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); + + return data_in_nest; +} + +torch::Tensor ring2xy(torch::Tensor data_in_ring, const std::string& origin, const bool clockwise, + const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); + + auto data_in_xy = torch::empty_like(data_in_ring); + + int src_origin = 0; + int dest_origin; + + bool src_clockwise = false; + bool dest_clockwise = clockwise; + + if (origin == "S"){ + dest_origin = 0; + } else if (origin == "E"){ + dest_origin = 1; + } else if (origin == "N"){ + dest_origin = 2; + } else if (origin == "W"){ + dest_origin = 3; + } else { + TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); + } + + + ring2xy_dispatch(data_in_ring, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); + + return data_in_xy; +} + +torch::Tensor xy2ring(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise, + const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. 
Stop."); + + auto data_in_ring = torch::empty_like(data_in_xy); + + int src_origin; + int dest_origin = 0; + + bool src_clockwise = clockwise; + bool dest_clockwise = false; + + if (origin == "S"){ + src_origin = 0; + } else if (origin == "E"){ + src_origin = 1; + } else if (origin == "N"){ + src_origin = 2; + } else if (origin == "W"){ + src_origin = 3; + } else { + TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); + } + + + xy2ring_dispatch(data_in_xy, data_in_ring, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); + + return data_in_ring; +} + +torch::Tensor xy2xy(torch::Tensor data_xy_in, const std::string& s_origin, const bool src_clockwise, const std::string& d_origin, const bool dest_clockwise, + const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); + + auto data_xy_out = torch::empty_like(data_xy_in); + + int src_origin; + int dest_origin; + + + if (s_origin == "S"){ + src_origin = 0; + } else if (s_origin == "E"){ + src_origin = 1; + } else if (s_origin == "N"){ + src_origin = 2; + } else if (s_origin == "W"){ + src_origin = 3; + } else { + TORCH_CHECK(false, "The src origin must be one from S, E, N, W. Stop"); + } + + if (d_origin == "S"){ + dest_origin = 0; + } else if (d_origin == "E"){ + dest_origin = 1; + } else if (d_origin == "N"){ + dest_origin = 2; + } else if (d_origin == "W"){ + dest_origin = 3; + } else { + TORCH_CHECK(false, "The dest origin must be one from S, E, N, W. Stop"); + } + + + xy2xy_dispatch(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); + + return data_xy_out; +} + +void benchmark_nest_ring(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); + + benchmark_nest_ring_dispatch(data_in_nest, data_in_ring, nside, num_elements); +} + + +torch::Tensor ring2nest_batch(torch::Tensor data_in_ring, const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); + + auto data_in_nest = torch::empty_like(data_in_ring); + + ring2nest_batch_dispatch(data_in_ring, data_in_nest, nside, num_elements); + + return data_in_nest; +} + + +torch::Tensor nest2ring_batch(torch::Tensor data_in_nest, const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. 
Stop."); + + auto data_in_ring = torch::empty_like(data_in_nest); + + nest2ring_batch_dispatch(data_in_nest, data_in_ring, nside, num_elements); + + return data_in_ring; +} + +torch::Tensor nest2xy_batch(torch::Tensor data_in_nest, const std::string& origin, const bool clockwise, + const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); + + auto data_in_xy = torch::empty_like(data_in_nest); + + int src_origin = 0; + int dest_origin; + + bool src_clockwise = false; + bool dest_clockwise = clockwise; + + if (origin == "S"){ + dest_origin = 0; + } else if (origin == "E"){ + dest_origin = 1; + } else if (origin == "N"){ + dest_origin = 2; + } else if (origin == "W"){ + dest_origin = 3; + } else { + TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); + } + + + nest2xy_batch_dispatch(data_in_nest, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); + + return data_in_xy; +} + +torch::Tensor xy2nest_batch(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise, + const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); + + auto data_in_nest = torch::empty_like(data_in_xy); + + + int src_origin; + int dest_origin = 0; + + bool src_clockwise = clockwise; + bool dest_clockwise = false; + + if (origin == "S"){ + src_origin = 0; + } else if (origin == "E"){ + src_origin = 1; + } else if (origin == "N"){ + src_origin = 2; + } else if (origin == "W"){ + src_origin = 3; + } else { + TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); + } + + + xy2nest_batch_dispatch(data_in_xy, data_in_nest, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); + + return data_in_nest; +} + +torch::Tensor ring2xy_batch(torch::Tensor data_in_ring, const std::string& origin, const bool clockwise, + const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); + + auto data_in_xy = torch::empty_like(data_in_ring); + + int src_origin = 0; + int dest_origin; + + bool src_clockwise = false; + bool dest_clockwise = clockwise; + + if (origin == "S"){ + dest_origin = 0; + } else if (origin == "E"){ + dest_origin = 1; + } else if (origin == "N"){ + dest_origin = 2; + } else if (origin == "W"){ + dest_origin = 3; + } else { + TORCH_CHECK(false, "The origin must be one from S, E, N, W. 
Stop"); + } + + + ring2xy_batch_dispatch(data_in_ring, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); + + return data_in_xy; +} + +torch::Tensor xy2ring_batch(torch::Tensor data_in_xy, const std::string& origin, const bool clockwise, + const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); + + auto data_in_ring = torch::empty_like(data_in_xy); + + int src_origin; + int dest_origin = 0; + + bool src_clockwise = clockwise; + bool dest_clockwise = false; + + if (origin == "S"){ + src_origin = 0; + } else if (origin == "E"){ + src_origin = 1; + } else if (origin == "N"){ + src_origin = 2; + } else if (origin == "W"){ + src_origin = 3; + } else { + TORCH_CHECK(false, "The origin must be one from S, E, N, W. Stop"); + } + + + xy2ring_batch_dispatch(data_in_xy, data_in_ring, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); + + return data_in_ring; +} + +torch::Tensor xy2xy_batch(torch::Tensor data_xy_in, const std::string& s_origin, const bool src_clockwise, const std::string& d_origin, const bool dest_clockwise, + const int nside, const size_t num_elements) { + + const size_t expected_num_elements = static_cast(nside)*nside*12; + + // Check if num_elements matches the expected number of elements + TORCH_CHECK(num_elements == expected_num_elements, "The number of elements in the input array is not equal to the number of HEALPix grid at current nside. Stop."); + + auto data_xy_out = torch::empty_like(data_xy_in); + + int src_origin; + int dest_origin; + + + if (s_origin == "S"){ + src_origin = 0; + } else if (s_origin == "E"){ + src_origin = 1; + } else if (s_origin == "N"){ + src_origin = 2; + } else if (s_origin == "W"){ + src_origin = 3; + } else { + TORCH_CHECK(false, "The src origin must be one from S, E, N, W. Stop"); + } + + if (d_origin == "S"){ + dest_origin = 0; + } else if (d_origin == "E"){ + dest_origin = 1; + } else if (d_origin == "N"){ + dest_origin = 2; + } else if (d_origin == "W"){ + dest_origin = 3; + } else { + TORCH_CHECK(false, "The dest origin must be one from S, E, N, W. 
Stop"); + } + + + xy2xy_batch_dispatch(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements); + + return data_xy_out; +} + + + +PYBIND11_MODULE(cuhpx_remap, m) { + + m.def("ring2nest", &ring2nest, "Convert ring to nest (CUDA)"); + m.def("nest2ring", &nest2ring, "Convert nest to ring (CUDA)"); + + m.def("nest2xy", &nest2xy, "Convert nest to xy (CUDA)"); + m.def("xy2nest", &xy2nest, "Convert xy to nest (CUDA)"); + + m.def("ring2xy", &ring2xy, "Convert ring to xy (CUDA)"); + m.def("xy2ring", &xy2ring, "Convert xy to ring (CUDA)"); + + m.def("xy2xy", &xy2xy, "Convert xy to xy (CUDA)"); + + m.def("benchmark_nest_ring", &benchmark_nest_ring, "Benchmark nest and ring (CUDA)"); + + m.def("ring2nest_batch", &ring2nest_batch, "Convert ring to nest (CUDA) in batch"); + m.def("nest2ring_batch", &nest2ring_batch, "Convert nest to ring (CUDA) in batch"); + + m.def("nest2xy_batch", &nest2xy_batch, "Convert nest to xy (CUDA) in batch"); + m.def("xy2nest_batch", &xy2nest_batch, "Convert xy to nest (CUDA) in batch"); + + m.def("ring2xy_batch", &ring2xy_batch, "Convert ring to xy (CUDA) in batch"); + m.def("xy2ring_batch", &xy2ring_batch, "Convert xy to ring (CUDA) in batch"); + + m.def("xy2xy_batch", &xy2xy_batch, "Convert xy to xy (CUDA) in batch"); + +} diff --git a/src/data_remapping/hpx_remapping.h b/src/data_remapping/hpx_remapping.h index 5ed936a..2edc77d 100644 --- a/src/data_remapping/hpx_remapping.h +++ b/src/data_remapping/hpx_remapping.h @@ -43,6 +43,9 @@ void xy2ring_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_ring, cons void xy2xy_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, const int src_origin, const bool src_clockwise, const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements); +void benchmark_nest_ring_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements); + + void ring2nest_batch_dispatch(torch::Tensor data_in_ring, torch::Tensor data_in_nest, const int nside, const size_t num_elements); void nest2ring_batch_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements); @@ -63,4 +66,3 @@ void xy2xy_batch_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, c const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements); #endif // HPX_REMAPPING_H - diff --git a/src/data_remapping/hpx_remapping_cuda.cu b/src/data_remapping/hpx_remapping_cuda.cu index eb0f677..41cf8f0 100644 --- a/src/data_remapping/hpx_remapping_cuda.cu +++ b/src/data_remapping/hpx_remapping_cuda.cu @@ -133,15 +133,15 @@ __device__ inline void correct_xy_orient(int& ix, int& iy, const int order, cons iy = new_iy; } -template +template __global__ void rearrange_data_kernel_naive(const T* d_data_in, T* d_data_out, const I* d_pix_array, const size_t num_elements){ - + int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < num_elements){ I pix = d_pix_array[tid]; d_data_out[pix] = d_data_in[tid]; - + } } @@ -157,7 +157,7 @@ template } -template +template __global__ void ring2xyf_kernel(const I* pix_array, int* ix_array, int* iy_array, int* face_num_array, const int order, const I nside, const size_t num_elements){ @@ -192,10 +192,10 @@ __global__ void ring2xyf_kernel(const I* pix_array, int* ix_array, int* iy_array I irm = 2*nside + 1 - tmp; I ifm = iphi - (ire >> 1) + nside - 1; I ifp = iphi - (irm >> 1) + nside - 1; - + ifm >>= order; ifp >>= order; - + face_num = (ifp == 
@@ -224,8 +224,8 @@
 }
 
-template <typename I> 
-__global__ void xyf2ring_kernel(I* pix_array, const int* ix_array, const int* iy_array, const int* face_num_array, 
+template <typename I>
+__global__ void xyf2ring_kernel(I* pix_array, const int* ix_array, const int* iy_array, const int* face_num_array,
                                 const int order, const I nside, const size_t num_elements) {
 
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -281,7 +281,7 @@
     }
 }
 
-template <typename I> 
+template <typename I>
 __global__ void nest2xyf_kernel(const I* pix_array, int* ix_array, int* iy_array, int* face_num_array,
                                 const int order, const I nside, const size_t num_elements) {
 
@@ -303,7 +303,7 @@
     }
 }
 
-template <typename I> 
+template <typename I>
 __global__ void xyf2nest_kernel(I* pix_array, const int* ix_array, const int* iy_array, const int* face_num_array,
                                 const int order, const I nside, const size_t num_elements){
 
@@ -601,7 +601,7 @@ void ring2nest_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data_in_
     ring2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
     xyf2nest_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
     rearrange_data_kernel_naive<T, int> <<<num_blocks, block_size>>>(data_in_ring.data_ptr<T>(), data_in_nest.data_ptr<T>(), d_pix_array, num_elements);
-    
+
     //cudaDeviceSynchronize();
 
     cudaFree(d_pix_array);
@@ -633,7 +633,7 @@ void nest2ring_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_
     nest2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
     xyf2ring_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
     rearrange_data_kernel_naive<T, int> <<<num_blocks, block_size>>>(data_in_nest.data_ptr<T>(), data_in_ring.data_ptr<T>(), d_pix_array, num_elements);
-    
+
     //cudaDeviceSynchronize();
 
     cudaFree(d_pix_array);
@@ -643,6 +643,170 @@ void nest2ring_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_
 }
 
+template <typename T>
+void benchmark_nest_ring_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements) {
+
+    int block_size = 1024;
+    int num_blocks = (num_elements + block_size - 1) / block_size;
+
+    int order = compute_order(nside);
+
+    int* d_pix_array;
+    int* d_ix_array;
+    int* d_iy_array;
+    int* d_face_num_array;
+
+    cudaMalloc(&d_pix_array, num_elements * sizeof(int));
+    cudaMalloc(&d_ix_array, num_elements * sizeof(int));
+    cudaMalloc(&d_iy_array, num_elements * sizeof(int));
+    cudaMalloc(&d_face_num_array, num_elements * sizeof(int));
+
+    float total_elapsed_time_r2n = 0.0f;
+    float total_elapsed_time_n2r = 0.0f;
+
+    float total_elapsed_time_n2xy = 0.0f;
+    float total_elapsed_time_xy2n = 0.0f;
+
+    float elapsed_time;
+
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    cudaStream_t stream;
+    cudaStreamCreate(&stream);
+
+    int num_runs = 3;
+    int k = 2;
+    bool flip = false;
+
+    // warm-up runs
+    for (int i = 0; i < num_runs; ++i)
+    {
+        // nest to ring
+        initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
+        nest2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
+        xyf2ring_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
+        rearrange_data_kernel_naive<T, int> <<<num_blocks, block_size>>>(data_in_nest.data_ptr<T>(), data_in_ring.data_ptr<T>(), d_pix_array, num_elements);
+
+        // ring to nest
+        initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
+        ring2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
+        xyf2nest_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
+        rearrange_data_kernel_naive<T, int> <<<num_blocks, block_size>>>(data_in_ring.data_ptr<T>(), data_in_nest.data_ptr<T>(), d_pix_array, num_elements);
+
+        // nest2xy
+        initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
+        nest2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
+        xyf2flat_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements, flip, k);
+        rearrange_data_kernel_naive<T, int> <<<num_blocks, block_size>>>(data_in_nest.data_ptr<T>(), data_in_ring.data_ptr<T>(), d_pix_array, num_elements);
+
+        // xy2nest
+        initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
+        flat2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements, flip, k);
+        xyf2nest_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
+        rearrange_data_kernel_naive<T, int> <<<num_blocks, block_size>>>(data_in_ring.data_ptr<T>(), data_in_nest.data_ptr<T>(), d_pix_array, num_elements);
+    }
+
+    cudaDeviceSynchronize();
+
+    num_runs = 10;
+
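+    // Timing pattern for the loop below: bracket each conversion with
+    // cudaEventRecord(start/stop, stream), block on cudaEventSynchronize(stop),
+    // then accumulate cudaEventElapsedTime in milliseconds. The kernels launch
+    // on the legacy default stream, which serializes with `stream`, so the
+    // events still bound the kernel work.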
+    for (int i = 0; i < num_runs; ++i)
+    {
+
+        // N2R
+        cudaEventRecord(start, stream);
+
+        initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
+        nest2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
+        xyf2ring_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
+        rearrange_data_kernel_naive<T, int> <<<num_blocks, block_size>>>(data_in_nest.data_ptr<T>(), data_in_ring.data_ptr<T>(), d_pix_array, num_elements);
+
+        cudaEventRecord(stop, stream);
+        cudaEventSynchronize(stop);
+
+        elapsed_time = 0.0f;
+        cudaEventElapsedTime(&elapsed_time, start, stop);
+        total_elapsed_time_n2r += elapsed_time;
+
+        // R2N
+        cudaEventRecord(start, stream);
+
+        initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
+        ring2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
+        xyf2nest_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
+        rearrange_data_kernel_naive<T, int> <<<num_blocks, block_size>>>(data_in_ring.data_ptr<T>(), data_in_nest.data_ptr<T>(), d_pix_array, num_elements);
+
+        cudaEventRecord(stop, stream);
+        cudaEventSynchronize(stop);
+
+        elapsed_time = 0.0f;
+        cudaEventElapsedTime(&elapsed_time, start, stop);
+        total_elapsed_time_r2n += elapsed_time;
+
+        // nest2xy
+        cudaEventRecord(start, stream);
+
+        initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
+        nest2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
+        xyf2flat_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements, flip, k);
+        rearrange_data_kernel_naive<T, int> <<<num_blocks, block_size>>>(data_in_nest.data_ptr<T>(), data_in_ring.data_ptr<T>(), d_pix_array, num_elements);
+
+        cudaEventRecord(stop, stream);
+        cudaEventSynchronize(stop);
+
+        elapsed_time = 0.0f;
+        cudaEventElapsedTime(&elapsed_time, start, stop);
+        total_elapsed_time_n2xy += elapsed_time;
+
+        // xy2nest
+        cudaEventRecord(start, stream);
+
+        initialize_pix_array<<<num_blocks, block_size>>>(d_pix_array, num_elements);
+        flat2xyf_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements, flip, k);
+        xyf2nest_kernel<<<num_blocks, block_size>>>(d_pix_array, d_ix_array, d_iy_array, d_face_num_array, order, nside, num_elements);
+        rearrange_data_kernel_naive<T, int> <<<num_blocks, block_size>>>(data_in_ring.data_ptr<T>(), data_in_nest.data_ptr<T>(), d_pix_array, num_elements);
+
+        cudaEventRecord(stop, stream);
+        cudaEventSynchronize(stop);
+
+        elapsed_time = 0.0f;
+        cudaEventElapsedTime(&elapsed_time, start, stop);
+        total_elapsed_time_xy2n += elapsed_time;
+
+    }
+
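+    // The averages below cover only the timed iterations; the warm-up runs
+    // above are excluded so one-time startup costs do not skew the numbers.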
+    float average_elapsed_time_n2r = total_elapsed_time_n2r / num_runs;
+    float average_elapsed_time_r2n = total_elapsed_time_r2n / num_runs;
+    float average_elapsed_time_n2xy = total_elapsed_time_n2xy / num_runs;
+    float average_elapsed_time_xy2n = total_elapsed_time_xy2n / num_runs;
+
+    std::cout << "Average elapsed time of the CUDA ring2nest kernels over " << num_runs << " runs: " << average_elapsed_time_r2n << " ms" << std::endl;
+    std::cout << "Average elapsed time of the CUDA nest2ring kernels over " << num_runs << " runs: " << average_elapsed_time_n2r << " ms" << std::endl;
+
+    std::cout << "Average elapsed time of the CUDA nest2xy kernels over " << num_runs << " runs: " << average_elapsed_time_n2xy << " ms" << std::endl;
+    std::cout << "Average elapsed time of the CUDA xy2nest kernels over " << num_runs << " runs: " << average_elapsed_time_xy2n << " ms" << std::endl;
+
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+    cudaStreamDestroy(stream);
+
+    cudaDeviceSynchronize();
+
+    cudaFree(d_pix_array);
+    cudaFree(d_ix_array);
+    cudaFree(d_iy_array);
+    cudaFree(d_face_num_array);
+}
+
 // Dispatch functions for each wrapper
 void ring2nest_dispatch(torch::Tensor data_in_ring, torch::Tensor data_in_nest, const int nside, const size_t num_elements) {
@@ -661,11 +825,11 @@ void nest2ring_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_ring,
 
 void nest2xy_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_xy, const int src_origin, const bool src_clockwise,
                       const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements) {
-    
+
     AT_DISPATCH_ALL_TYPES(data_in_nest.scalar_type(), "nest2xy", ([&] {
         nest2xy_kernel_wrapper<scalar_t>(data_in_nest, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
     }));
-    
+
 }
 
 void xy2nest_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_nest, const int src_origin, const bool src_clockwise,
@@ -683,7 +847,7 @@ void ring2xy_dispatch(torch::Tensor data_in_ring, torch::Tensor data_in_xy, cons
     AT_DISPATCH_ALL_TYPES(data_in_ring.scalar_type(), "ring2xy", ([&] {
         ring2xy_kernel_wrapper<scalar_t>(data_in_ring, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
     }));
-    
+
 }
 
 void xy2ring_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_ring, const int src_origin, const bool src_clockwise,
@@ -692,7 +856,7 @@ void xy2ring_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_ring, cons
     AT_DISPATCH_ALL_TYPES(data_in_xy.scalar_type(), "xy2ring", ([&] {
         xy2ring_kernel_wrapper<scalar_t>(data_in_xy, data_in_ring, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
     }));
-    
+
 }
 
 void xy2xy_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, const int src_origin, const bool src_clockwise,
@@ -701,51 +865,61 @@ void xy2xy_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, const i
     AT_DISPATCH_ALL_TYPES(data_xy_in.scalar_type(), "xy2xy", ([&] {
         xy2xy_kernel_wrapper<scalar_t>(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
     }));
-    
+
 }
 
+void benchmark_nest_ring_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_ring, const int nside, const size_t num_elements) {
+
+    AT_DISPATCH_ALL_TYPES(data_in_nest.scalar_type(), "benchmark_nest_ring", ([&] {
+        benchmark_nest_ring_kernel_wrapper<scalar_t>(data_in_nest, data_in_ring, nside, num_elements);
+    }));
+
+}
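[Editor's note] AT_DISPATCH_ALL_TYPES selects the kernel-wrapper instantiation from the tensor's runtime dtype. Conceptually it expands to a switch of the following shape (a simplified sketch, not the actual ATen macro expansion; the _expanded name is hypothetical):

    // Schematic of what AT_DISPATCH_ALL_TYPES does for this call site;
    // the real macro covers every ATen scalar type and richer error handling.
    void benchmark_nest_ring_dispatch_expanded(torch::Tensor nest, torch::Tensor ring,
                                               int nside, size_t num_elements) {
        switch (nest.scalar_type()) {
            case torch::kFloat:  benchmark_nest_ring_kernel_wrapper<float>(nest, ring, nside, num_elements);   break;
            case torch::kDouble: benchmark_nest_ring_kernel_wrapper<double>(nest, ring, nside, num_elements);  break;
            case torch::kInt:    benchmark_nest_ring_kernel_wrapper<int>(nest, ring, nside, num_elements);     break;
            case torch::kLong:   benchmark_nest_ring_kernel_wrapper<int64_t>(nest, ring, nside, num_elements); break;
            default: TORCH_CHECK(false, "benchmark_nest_ring: unsupported dtype");
        }
    }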
+
+
 
 template <typename T, typename I>
-__global__ void rearrange_data_kernel_3d_batch(const T* d_data_in, T* d_data_out, const I* d_pix_array, 
+__global__ void rearrange_data_kernel_3d_batch(const T* d_data_in, T* d_data_out, const I* d_pix_array,
                                                const size_t m, const size_t n, const size_t num_elements) {
-    
+
     // d_data_in: m by n by num_elements 3D matrix
     int idz = blockIdx.z * blockDim.z + threadIdx.z;
     int idy = blockIdx.y * blockDim.y + threadIdx.y;
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    
+
     int tid = idz * n * num_elements + idy * num_elements + idx;
-    
+
     if (idz < m && idy < n && idx < num_elements) {
-        I pix = d_pix_array[idx]; 
+        I pix = d_pix_array[idx];
         d_data_out[tid-idx + pix] = d_data_in[tid];
     }
 }
 
 template <typename T, typename I>
-__global__ void rearrange_data_kernel_batch(const T* d_data_in, T* d_data_out, const I* d_pix_array, 
+__global__ void rearrange_data_kernel_batch(const T* d_data_in, T* d_data_out, const I* d_pix_array,
                                             const size_t n, const size_t num_elements) {
 
     unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
     unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    
+
     unsigned int tid = idy * num_elements + idx;
-    
+
     if (idy < n && idx < num_elements) {
-        I pix = d_pix_array[idx]; 
+        I pix = d_pix_array[idx];
         d_data_out[tid-idx + pix] = d_data_in[tid];
     }
 }
 
 template <typename T>
-void xy2xy_batch_kernel_wrapper(torch::Tensor data_xy_in, torch::Tensor data_xy_out, 
+void xy2xy_batch_kernel_wrapper(torch::Tensor data_xy_in, torch::Tensor data_xy_out,
     const int src_origin, const bool src_clockwise,
-    const int dest_origin, const bool dest_clockwise, 
+    const int dest_origin, const bool dest_clockwise,
     const int nside, const size_t num_elements) {
 
     // Check tensor dimensions and sizes
@@ -791,9 +965,9 @@ void xy2xy_batch_kernel_wrapper(torch::Tensor data_xy_in, torch::Tensor data_xy_
 
 template <typename T>
-void nest2xy_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_xy, 
+void nest2xy_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_xy,
     const int src_origin, const bool src_clockwise,
-    const int dest_origin, const bool dest_clockwise, 
+    const int dest_origin, const bool dest_clockwise,
     const int nside, const size_t num_elements) {
 
     // Check tensor dimensions and sizes
@@ -848,9 +1022,9 @@ void nest2xy_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data
 }
 
 template <typename T>
-void xy2nest_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_in_nest, 
+void xy2nest_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_in_nest,
     const int src_origin, const bool src_clockwise,
-    const int dest_origin, const bool dest_clockwise, 
+    const int dest_origin, const bool dest_clockwise,
     const int nside, const size_t num_elements) {
 
     // Check tensor dimensions and sizes
@@ -905,9 +1079,9 @@ void xy2nest_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_i
 }
 
 template <typename T>
-void ring2xy_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data_in_xy, 
+void ring2xy_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data_in_xy,
     const int src_origin, const bool src_clockwise,
-    const int dest_origin, const bool dest_clockwise, 
+    const int dest_origin, const bool dest_clockwise,
     const int nside, const size_t num_elements) {
 
     // Check tensor dimensions and sizes
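[Editor's note] In rearrange_data_kernel_3d_batch and rearrange_data_kernel_batch above, tid - idx is the offset of the current batch slice, so d_data_out[tid - idx + pix] permutes each slice independently with the same pixel map. The same algebra on the host, for reference (a sketch; in, out, pix, m, n, npix are hypothetical host-side arrays and sizes):

    // Host reference for the batched scatter: every (z, y) slice of length
    // npix is permuted by the same map pix[].
    for (size_t z = 0; z < m; ++z)
        for (size_t y = 0; y < n; ++y)
            for (size_t x = 0; x < npix; ++x)
                out[(z * n + y) * npix + pix[x]] = in[(z * n + y) * npix + x];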
@@ -963,9 +1137,9 @@ void ring2xy_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data
 
 template <typename T>
-void xy2ring_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_in_ring, 
+void xy2ring_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_in_ring,
     const int src_origin, const bool src_clockwise,
-    const int dest_origin, const bool dest_clockwise, 
+    const int dest_origin, const bool dest_clockwise,
     const int nside, const size_t num_elements) {
 
     // Check tensor dimensions and sizes
@@ -1021,7 +1195,7 @@ void xy2ring_batch_kernel_wrapper(torch::Tensor data_in_xy, torch::Tensor data_i
 
 template <typename T>
-void ring2nest_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data_in_nest, 
+void ring2nest_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor data_in_nest,
                                     const int nside, const size_t num_elements) {
 
     // Check tensor dimensions and sizes
@@ -1070,7 +1244,7 @@ void ring2nest_batch_kernel_wrapper(torch::Tensor data_in_ring, torch::Tensor da
 }
 
 template <typename T>
-void nest2ring_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_ring, 
+void nest2ring_batch_kernel_wrapper(torch::Tensor data_in_nest, torch::Tensor data_in_ring,
                                     const int nside, const size_t num_elements) {
 
     // Check tensor dimensions and sizes
@@ -1137,11 +1677,11 @@ void nest2ring_batch_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_
 
 void nest2xy_batch_dispatch(torch::Tensor data_in_nest, torch::Tensor data_in_xy, const int src_origin, const bool src_clockwise,
                             const int dest_origin, const bool dest_clockwise, const int nside, const size_t num_elements) {
-    
+
     AT_DISPATCH_ALL_TYPES(data_in_nest.scalar_type(), "nest2xy_batch", ([&] {
         nest2xy_batch_kernel_wrapper<scalar_t>(data_in_nest, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
     }));
-    
+
 }
 
 void xy2nest_batch_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_nest, const int src_origin, const bool src_clockwise,
@@ -1159,7 +1699,7 @@ void ring2xy_batch_dispatch(torch::Tensor data_in_ring, torch::Tensor data_in_xy
     AT_DISPATCH_ALL_TYPES(data_in_ring.scalar_type(), "ring2xy_batch", ([&] {
         ring2xy_batch_kernel_wrapper<scalar_t>(data_in_ring, data_in_xy, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
     }));
-    
+
 }
 
 void xy2ring_batch_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_ring, const int src_origin, const bool src_clockwise,
@@ -1168,7 +1708,7 @@ void xy2ring_batch_dispatch(torch::Tensor data_in_xy, torch::Tensor data_in_ring
     AT_DISPATCH_ALL_TYPES(data_in_xy.scalar_type(), "xy2ring_batch", ([&] {
         xy2ring_batch_kernel_wrapper<scalar_t>(data_in_xy, data_in_ring, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
     }));
-    
+
 }
 
 void xy2xy_batch_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, const int src_origin, const bool src_clockwise,
@@ -1177,6 +1717,5 @@ void xy2xy_batch_dispatch(torch::Tensor data_xy_in, torch::Tensor data_xy_out, c
     AT_DISPATCH_ALL_TYPES(data_xy_in.scalar_type(), "xy2xy_batch", ([&] {
         xy2xy_batch_kernel_wrapper<scalar_t>(data_xy_in, data_xy_out, src_origin, src_clockwise, dest_origin, dest_clockwise, nside, num_elements);
     }));
-    
-}
+}
diff --git a/src/harmonic_transform/hpx_fft.cpp b/src/harmonic_transform/hpx_fft.cpp
index ddacfd2..9ec28ac 100644
--- a/src/harmonic_transform/hpx_fft.cpp
+++ b/src/harmonic_transform/hpx_fft.cpp
@@ -15,7 +15,6 @@
  * limitations under the License.
  */
 
-#include
 #include "hpx_fft.h"
 #include
@@ -422,6 +421,11 @@ torch::Tensor healpix_irfft_cuda(torch::Tensor ftm, int L, int nside) {
 }
 
+// PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+//     m.def("healpix_rfft_cuda", &healpix_rfft_cuda, "HEALPix RFFT with cuFFT and CUDA");
+//     m.def("healpix_irfft_cuda", &healpix_irfft_cuda, "HEALPix IRFFT with cuFFT and CUDA");
+// }
+
 int nphi_ring(int t, int nside);
 int cumulative_nphi_ring(int t, int nside);
 double p2phi_ring(int t, int offset, int nside);

From 5de3f7c6a2afd45d68368d139c11a25676021bf2 Mon Sep 17 00:00:00 2001
From: xiaopoc <xiaopoc@nvidia.com>
Date: Wed, 13 Aug 2025 11:10:19 -0700
Subject: [PATCH 6/6] update

---
 pyproject.toml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 326b0fa..7cf4538 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,8 +7,7 @@ name = "cuHPX"
 version = "2025.8.1"
 description = "CUDA-accelerated HEALPix tools for harmonic transforms and remapping"
 authors = [
-    { name = "Xiaopo Cheng", email = "xiaopoc@nvidia.com" },
-    { name = "Akshay Subramaniam", email = "asubramaniam@nvidia.com" }
+    { name = "NVIDIA", email = "asubramaniam@nvidia.com" }
 ]
 readme = "README.md"
 license = { file = "LICENSE.txt" }
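[Editor's note] For quick validation of the remapping path touched by these patches, a round-trip check along these lines is useful: ring2nest followed by nest2ring must reproduce the input exactly. This is a hypothetical smoke test, not part of the series; it assumes a CUDA device, libtorch, and the dispatch declarations from hpx_remapping.h:

    // Hypothetical smoke test: ring -> nest -> ring should be the identity.
    #include <torch/torch.h>
    #include "hpx_remapping.h"

    int main() {
        const int nside = 64;
        const size_t npix = static_cast<size_t>(nside) * nside * 12;

        // Distinct values per pixel make any misrouted index visible.
        auto ring = torch::arange(static_cast<int64_t>(npix),
                                  torch::dtype(torch::kFloat).device(torch::kCUDA));
        auto nest = torch::empty_like(ring);
        auto back = torch::empty_like(ring);

        ring2nest_dispatch(ring, nest, nside, npix);
        nest2ring_dispatch(nest, back, nside, npix);

        TORCH_CHECK(torch::equal(ring, back), "ring->nest->ring round trip failed");
        return 0;
    }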