|
#include "configure_logging.hpp"

#include <CLI/CLI.hpp>// NOLINT

#include <cudf/concatenate.hpp>
#include <cudf/io/datasource.hpp>
#include <cudf/io/parquet.hpp>
#include <cudf/io/types.hpp>
#include <cudf/sorting.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>
#include <internal_use_only/config.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/owning_wrapper.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>
#ifdef SPDLOG_ACTIVE_LEVEL
#undef SPDLOG_ACTIVE_LEVEL
#endif
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#include <spdlog/spdlog.h>

#include <cstddef>
#include <cstdlib>
#include <exception>
#include <memory>
#include <string>
#include <utility>
#include <vector>
| 27 | + |
| 28 | +int main(int argc, char **argv) { |
| 29 | + configure_logging(); |
| 30 | + // NOLINTNEXTLINE |
| 31 | + CLI::App app{ "Simple program to check if Parquet file is sorted with cuDF", "check_sort" }; |
| 32 | + app.set_version_flag("--version", std::string{ gpu_compact::cmake::project_version }); |
| 33 | + |
| 34 | + std::string inputFile; |
| 35 | + app.add_option("input", inputFile, "Input Parquet file")->required(); |
| 36 | + std::string colName; |
| 37 | + app.add_option("Column name", colName, "Column to validate sort order")->required(); |
| 38 | + CLI11_PARSE(app, argc, argv);// NOLINT |
| 39 | + |
| 40 | + // force gpu initialization so it's not included in the time |
| 41 | + rmm::cuda_stream_default.synchronize(); |
| 42 | + |
| 43 | + auto cuda_mr = std::make_shared<rmm::mr::cuda_memory_resource>(); |
| 44 | + auto mr = |
| 45 | + rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(cuda_mr, rmm::percent_of_free_device_memory(95)); |
| 46 | + rmm::mr::set_current_device_resource(mr.get()); |
| 47 | + |
| 48 | + auto opts = |
| 49 | + cudf::io::parquet_reader_options::builder(cudf::io::source_info(inputFile)).columns({ colName }).build(); |
| 50 | + cudf::io::chunked_parquet_reader reader{ 500 * 1'048'576l, 500 * 1'048'576l, opts }; |
| 51 | + |
| 52 | + SPDLOG_INFO("Validating sort on column '{}' in file '{}'", colName, inputFile); |
| 53 | + |
| 54 | + // Loop doing reads |
| 55 | + // A sort problem may occur across a chunk boundary, so we use a slightly hacky workaround of keeping |
| 56 | + // the last chunk stored and then concatenate the current chunk on to it and then check that for sorting. |
| 57 | + ::size_t totalRowsRead = 0; |
| 58 | + ::size_t chunkNo = 0; |
| 59 | + cudf::io::table_with_metadata prevTable; |
| 60 | + while (reader.has_next()) { |
| 61 | + auto currentTable = reader.read_chunk(); |
| 62 | + ::size_t rowsRead = currentTable.tbl->num_rows(); |
| 63 | + SPDLOG_INFO("Checking chunk number {:d} has {:d} rows", chunkNo, rowsRead); |
| 64 | + |
| 65 | + std::unique_ptr<cudf::table> checkTable; |
| 66 | + cudf::table_view checkView = currentTable.tbl->view(); |
| 67 | + if (prevTable.tbl) { |
| 68 | + checkTable = cudf::concatenate(std::vector<cudf::table_view>{ *prevTable.tbl, *currentTable.tbl }); |
| 69 | + checkView = std::move(checkTable->view()); |
| 70 | + } |
| 71 | + bool chunkSorted = cudf::is_sorted(checkView, { cudf::order::ASCENDING }, { cudf::null_order::AFTER }); |
| 72 | + if (!chunkSorted) { |
| 73 | + SPDLOG_ERROR("Chunk number {:d} contains an incorrect sort order between rows [{:d},{:d})", |
| 74 | + chunkNo, |
| 75 | + totalRowsRead, |
| 76 | + totalRowsRead + rowsRead); |
| 77 | + CUDF_FAIL("Incorrect sort detected"); |
| 78 | + } |
| 79 | + chunkNo++; |
| 80 | + totalRowsRead += rowsRead; |
| 81 | + prevTable = std::move(currentTable); |
| 82 | + } |
| 83 | + |
| 84 | + SPDLOG_INFO("Finished, file is correctly sorted"); |
| 85 | +} |
0 commit comments