Skip to content

Commit 399a152

Browse files
committed
Fix number reporting and refactor
1 parent d3add2b commit 399a152

File tree

1 file changed

+45
-31
lines changed

1 file changed

+45
-31
lines changed

cpp/src/cmdline/chunk_reader.cpp

Lines changed: 45 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,20 @@
3838
#include <utility>
3939
#include <vector>
4040

41-
/// @brief Sum the total number of rows read so far across all readers.
///
/// Each reader entry is a tuple whose fourth element (index 3) holds the
/// cumulative row count for that reader; this folds those counts into one
/// grand total.
///
/// @param readers container of reader state tuples (must provide cbegin/cend)
/// @return total rows read across all readers; 0 for an empty container
template<typename Readers>
[[nodiscard]] std::size_t calcRowsRead(Readers const &readers) noexcept {
    // Use std::size_t{ 0 } (not ::size_t) so the accumulator type matches the
    // declared return type spelling.
    return std::accumulate(
      readers.cbegin(), readers.cend(), std::size_t{ 0 }, [](auto &&acc, auto const &item) constexpr noexcept {
          return acc + std::get<3>(item);
      });
}
4747

48+
/// @brief Sum the number of rows across a collection of table views.
///
/// @param views container of objects exposing num_rows() (e.g. cudf::table_view)
/// @return total row count over all views; 0 for an empty container
template<typename Views>
[[nodiscard]] std::size_t calcRowsInViews(Views const &views) noexcept {
    // Use std::size_t{ 0 } (not ::size_t) so the accumulator type matches the
    // declared return type spelling.
    return std::accumulate(
      views.cbegin(), views.cend(), std::size_t{ 0 }, [](auto &&acc, auto const &item) constexpr noexcept {
          return acc + item.num_rows();
      });
}
54+
4855
/// @brief Capture the current time from the monotonic clock.
///
/// steady_clock never runs backwards, so differences between two of these
/// timestamps are safe for measuring elapsed wall time.
///
/// @return the current std::chrono::steady_clock time point
[[nodiscard]] std::chrono::time_point<std::chrono::steady_clock> timestamp() noexcept {
    using clock = std::chrono::steady_clock;
    return clock::now();
}
@@ -114,82 +121,92 @@ int main(int argc, char **argv) {
114121
SinkInfoDetails sinkDetails = make_writer(outputFile, tableMetadata, s3client);
115122
auto &writer = *sinkDetails.writer;
116123

117-
SPDLOG_INFO("Start reading files");
124+
SPDLOG_INFO("Starting compaction on {:d} files containing {:d} total rows", inputFiles.size(), totalRows);
118125
// Remaining parts initially empty
119-
std::vector<std::unique_ptr<cudf::table>> remainingParts{ readers.size() };
120-
std::size_t lastTotalRowCount = std::numeric_limits<std::size_t>::max();
126+
std::vector<std::unique_ptr<cudf::table>> tables{ readers.size() };
127+
std::size_t rowsInMemory = std::numeric_limits<std::size_t>::max();
128+
std::size_t rowsWritten = 0;
121129
auto const startTime = timestamp();
122130
// Loop doing reads
123-
while (lastTotalRowCount) {
124-
lastTotalRowCount = 0;
131+
while (rowsInMemory) {
132+
rowsInMemory = 0;
125133
// Loop through each reader
126134
for (std::size_t rc = 0; auto &[src, reader, chunkNo, rowCount] : readers) {
135+
auto &oldTable = tables[rc];
127136
// If reader has data and we need some, perform a read
128137
SPDLOG_INFO("Reader {:d}", rc);
129138
if (reader->has_next()) {
130-
SPDLOG_INFO(" Reader has rows");
131-
if (!remainingParts[rc] || remainingParts[rc]->num_rows() < epsilon) {
132-
SPDLOG_INFO(
133-
" No previous table or we only have {:d} in memory", remainingParts[rc]->num_rows());
134-
139+
SPDLOG_INFO(" Reader has rows");
140+
if (!oldTable || oldTable->num_rows() < epsilon) {
141+
if (oldTable) {
142+
SPDLOG_INFO(" We only have {:d} in memory", oldTable->num_rows());
143+
} else {
144+
SPDLOG_INFO(" No previous data in memory");
145+
}
135146
// Read a chunk
136147
SPDLOG_INFO(" Read chunk: {:d}", chunkNo);
137-
auto table = reader->read_chunk();
138-
auto const rowsInChunk = table.metadata.num_rows_per_source.at(0);
148+
auto readTable = reader->read_chunk();
149+
auto const rowsInChunk = readTable.metadata.num_rows_per_source.at(0);
139150
SPDLOG_INFO(" Read chunk of {:d} rows", rowsInChunk);
140151
// Increment chunk number in reader and add to row count
141152
chunkNo++;
142153
rowCount += rowsInChunk;
143154

144155
// Now concat the old part to the new chunk
145156
std::unique_ptr<cudf::table> concat =
146-
cudf::concatenate(std::vector{ remainingParts[rc]->view(), table.tbl->view() });
147-
remainingParts[rc] = std::move(concat);
148-
SPDLOG_INFO(" New table has {:d} rows", remainingParts[rc]->num_rows());
157+
(oldTable) ? cudf::concatenate(std::vector{ oldTable->view(), readTable.tbl->view() })
158+
: std::move(readTable.tbl);
159+
oldTable = std::move(concat);
160+
SPDLOG_INFO(" New table has {:d} rows", tables[rc]->num_rows());
149161
}
150162
} else {
151163
SPDLOG_INFO(" Reader {:d} has no more rows", rc);
152164
}
153165

154166
// Update overall count
155-
lastTotalRowCount += remainingParts[rc]->num_rows();
167+
rowsInMemory += oldTable->num_rows();
156168
rc++;
157169
}
158170

171+
SPDLOG_INFO("There are {:d} rows to process", rowsInMemory);
159172
// Merge and write tables
160-
if (lastTotalRowCount > 0) {
173+
if (rowsInMemory > 0) {
161174
// Find the least upper bound in sort column across these tables
162-
auto const leastUpperBound = findLeastUpperBound(remainingParts, 0);
175+
auto const leastUpperBound = findLeastUpperBound(tables, 0);
163176

164177
// Now take the search "needle" from the last row of the table with the LUB
165-
auto const lubTable = remainingParts[leastUpperBound]->select({ 0 });
178+
auto const lubTable = tables[leastUpperBound]->select({ 0 });
166179
auto const needle = cudf::split(lubTable, { lubTable.num_rows() - 1 })[1];
167180

168181
// Split all tables at the needle
169182
std::pair<std::vector<cudf::table_view>, std::vector<cudf::table_view>> const tableVectors =
170-
splitAtNeedle(needle, remainingParts);
183+
splitAtNeedle(needle, tables);
171184

172185
// Merge all the upper parts of the tables
173-
SPDLOG_INFO("Merging {:d} rows", lastTotalRowCount);
186+
std::size_t rowsToWrite = calcRowsInViews(tableVectors.first);
187+
SPDLOG_INFO("Merging {:d} rows", rowsToWrite);
174188
auto merged = cudf::merge(tableVectors.first, { 0 }, { cudf::order::ASCENDING });
175189

176190
// Duplicate the unmerged parts of the tables, so we can opportunistically clear the original
177191
// tables we no longer need
178-
for (std::size_t idx = 0; auto &&table : remainingParts) {
192+
for (std::size_t idx = 0; auto &&table : tables) {
179193
table = std::make_unique<cudf::table>(tableVectors.second[idx]);
180194
idx++;
181195
}
182196

183197
writer.write(*merged);
198+
rowsWritten += rowsToWrite;
184199

185200
auto const elapsedTime = std::chrono::duration_cast<std::chrono::seconds>(timestamp() - startTime);
186-
auto const rowsWritten = calcRowsWritten(readers);
187-
auto const fracRowsWritten = (static_cast<double>(rowsWritten) / totalRows);
201+
auto const rowsRead = calcRowsRead(readers);
202+
auto const fracRowsRead = (static_cast<double>(rowsRead) / totalRows);
188203
auto const predictedTime =
189-
std::chrono::duration_cast<std::chrono::seconds>(elapsedTime * (1 / fracRowsWritten));
190-
SPDLOG_INFO("Written {:d} rows, {:.2f}% complete, est. time (total) {:02d}:{:02d} ({:02d}:{:02d})",
204+
std::chrono::duration_cast<std::chrono::seconds>(elapsedTime * (1 / fracRowsRead));
205+
SPDLOG_INFO(
206+
"Read {:d} rows, Wrote {:d} rows, {:.2f}% complete, est. time (total) {:02d}:{:02d} ({:02d}:{:02d})",
207+
rowsRead,
191208
rowsWritten,
192-
fracRowsWritten * 100,
209+
fracRowsRead * 100,
193210
elapsedTime.count() / 60,
194211
elapsedTime.count() % 60,
195212
predictedTime.count() / 60,
@@ -199,9 +216,6 @@ int main(int argc, char **argv) {
199216

200217
writer.close();
201218

202-
// Grab total row count from each reader
203-
auto const rowsWritten = calcRowsWritten(readers);
204-
205219
SPDLOG_INFO("Finished, read/wrote {:d} rows from {:d} readers", rowsWritten, inputFiles.size());
206220
}
207221
gpu_compact::s3::shutdownAWS();

0 commit comments

Comments
 (0)