Skip to content

Commit 746cdfa

Browse files
authored
Significantly improve performance of PalettedBlockArray::validate() (#35)
This change introduces a fast path for validation using carry-out vectors, as discussed here: https://devblogs.microsoft.com/oldnewthing/20190301-00/?p=101076 TL;DR: It's possible to detect invalid palette offsets by subtracting the current "word" from a bitfield composed of the max valid palette offset, which is significantly faster than shifting and naively comparing each one. In addition, we allow the code to chew through the whole loop even if invalid offsets are detected, in the hope that compilers will use SIMD instructions and/or parallelize the loop, which is also substantially faster than breaking out early on a branch condition to throw an exception. The downside of the fast path is that it cannot report the exact offset an error occurred at, only that an error is present. For this reason, the old, slower code is retained, which allows generating detailed errors if an error is detected. Note: This does cause a change of behaviour for palettes with padded words (3, 5, and 6 bits per block). The palette is now validated to be zero as a side effect of this change. If non-zero padding is found, the fast path will fail, and fallback to the slow path, which will ignore the error. We may want to explicitly ignore or force padding to be zero.
1 parent e2d6606 commit 746cdfa

File tree

2 files changed

+51
-9
lines changed

2 files changed

+51
-9
lines changed

config.w32

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ ARG_ENABLE("chunkutils2", "Enable PocketMine ChunkUtils2 extension", "no");
55

66
if (PHP_CHUNKUTILS2 != "no") {
77
EXTENSION("chunkutils2", "chunkutils2.cpp", PHP_CHUNKUTILS2_SHARED, "/DZEND_ENABLE_STATIC_TSRMLS_CACHE=1 /permissive- /I" + configure_module_dirname + " /I" + configure_module_dirname + "/gsl/include /DGSL_THROW_ON_CONTRACT_VIOLATION=1");
8-
ADD_FLAG("CFLAGS_CHUNKUTILS2", "/EHsc");
8+
ADD_FLAG("CFLAGS_CHUNKUTILS2", "/EHsc /Qvec-report:1");
99
ADD_SOURCES(
1010
configure_module_dirname + "/src",
1111
"PhpLightArray.cpp PhpPalettedBlockArray.cpp PhpSubChunkConverter.cpp",

lib/PalettedBlockArray.h

Lines changed: 50 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -118,14 +118,8 @@ class PalettedBlockArray final : public IPalettedBlockArray<Block> {
118118
words[wordIdx] = (words[wordIdx] & ~(BLOCK_MASK << shift)) | (offset << shift);
119119
}
120120

121-
void validate() const {
122-
if (MAX_PALETTE_OFFSET == MAX_PALETTE_SIZE && nextPaletteIndex >= MAX_PALETTE_SIZE) {
123-
//Every possible offset representable is valid, therefore no validation is required
124-
//this is an uncommon case, but more frequent in small palettes, which is a win because small palettes are more
125-
//expensive to verify than big ones due to more bitwise operations needed to extract the offsets
126-
//for 16 bpb, offsets may point beyond the end of the palette even at full capacity, so they must always be checked
127-
return;
128-
}
121+
void locateAndReportInvalidOffset() const {
122+
//Slow path, to allow giving detailed errors when a problem has already been detected by the fast path
129123
auto blockCount = 0;
130124
for (auto wordIdx = 0; wordIdx < words.size(); wordIdx++) {
131125
const auto word = words[wordIdx];
@@ -150,6 +144,54 @@ class PalettedBlockArray final : public IPalettedBlockArray<Block> {
150144
}
151145
}
152146

147+
#ifdef _MSC_VER
148+
//If this function gets inlined, MSVC 16.29 does not want to vectorize it :(
149+
__declspec(noinline)
150+
#endif
151+
bool fastValidate() const {
152+
Word invalid = 0;
153+
154+
Word expected = 0;
155+
for (unsigned int shift = 0; shift < BLOCKS_PER_WORD * BITS_PER_BLOCK_INT; shift += BITS_PER_BLOCK_INT) {
156+
expected |= ((nextPaletteIndex - 1) << shift);
157+
}
158+
159+
//Fast path - use carry-out vectors to detect invalid offsets
160+
//This trick is borrowed from https://devblogs.microsoft.com/oldnewthing/20190301-00/?p=101076
161+
//We can't detect exactly where the error is with this approach, but we leave that up to the slow code if we detect an error
162+
for (auto wordIdx = 0; wordIdx < words.size(); wordIdx++) {
163+
const auto word = words[wordIdx];
164+
165+
if (BITS_PER_BLOCK_INT == 1) {
166+
//For 1 bits-per-block, we can only reach this point if the palette isn't full, which means that only offset 0 is set.
167+
//This means we can save a few instructions and just check the word directly for non-zero bits.
168+
invalid |= word;
169+
} else {
170+
Word carryVector = (~expected & word) | (~(expected ^ word) & (expected - word));
171+
invalid |= carryVector;
172+
}
173+
}
174+
175+
return invalid == 0;
176+
}
177+
178+
void validate() const {
179+
if (MAX_PALETTE_OFFSET == MAX_PALETTE_SIZE && nextPaletteIndex >= MAX_PALETTE_SIZE) {
180+
//Every possible offset representable is valid, therefore no validation is required
181+
//this is an uncommon case, but more frequent in small palettes, which is a win because small palettes are more
182+
//expensive to verify than big ones due to more bitwise operations needed to extract the offsets
183+
//for 16 bpb, offsets may point beyond the end of the palette even at full capacity, so they must always be checked
184+
return;
185+
}
186+
187+
if (!fastValidate()) {
188+
//If we detected an error, use the old, slow method to locate the (first) source of errors
189+
//this allows giving a detailed error message
190+
//if the palette is valid, we can avoid this slow path entirely, which is the most likely outcome.
191+
locateAndReportInvalidOffset();
192+
}
193+
}
194+
153195
public:
154196

155197
PalettedBlockArray(Block block) {

0 commit comments

Comments
 (0)