Tricks for Bitwise Interleaving and Deinterleaving
概要
ビット単位の Interleave を効率化する手法を見つけたので,単純な実装と比べてみました.
- Interleave bits by Binary Magic Numbers - Bit Twiddling Hacks
Deinterleave については,Interleave の演算を概ね逆向きに適用することで実現できます.
ちなみに,ビット単位の Interleave は groonga の位置情報検索に使われています.
- groongaで高速な位置情報検索 - ククログ(2011-09-13)
比較の結果をまとめると,Interleave/Deinterleave にかかる時間を 1/10 にできました.また,groonga 付属のベンチマークによると,位置情報検索にかかる時間は 9/10 くらいになるようです.
環境
$ clang -v clang version 3.2 (tags/RELEASE_32/final) Target: x86_64-unknown-linux-gnu Thread model: posix
ビルド方法
$ clang++ -std=c++11 -Wall -O2 benchmark.cpp
実行結果
$ ./a.out Generating points: 1063000 [ns] Generating values: 4919000 [ns] Testing fast implementations: 925000 [ns] naive_interleave: 2169113000 [ns] -> average 129 [ns] fast_interleave: 195374000 [ns] -> average 11 [ns] naive_interleave: 2036491000 [ns] -> average 121 [ns] fast_interleave: 244196000 [ns] -> average 14 [ns] naive_interleave/deinterleave: 4202892000 [ns] -> average 250 [ns] fast_interleave/deinterleave: 482340000 [ns] -> average 28 [ns]
ソースコード
#include <cassert> #include <chrono> #include <cstdint> #include <iostream> #include <random> using namespace std; namespace { class Stopwatch { public: Stopwatch() : base_(now()) {} void reset() { base_ = now(); } int64_t elapsed() const { return now() - base_; } private: int64_t base_; static int64_t now() { return chrono::duration_cast<chrono::nanoseconds>( chrono::system_clock::now() - chrono::system_clock::from_time_t(0)).count(); } }; struct Point { int32_t x; int32_t y; }; uint64_t naive_interleave(const Point &point) { const uint64_t x = static_cast<uint32_t>(point.x); const uint64_t y = static_cast<uint32_t>(point.y); uint64_t value = 0; for (int i = 0; i < 32; ++i) { value |= (x & (1ULL << i)) << i; value |= (y & (1ULL << i)) << (i + 1); } return value; } Point naive_deinterleave(uint64_t value) { uint64_t x = 0; uint64_t y = 0; for (int i = 0; i < 32; ++i) { x |= (value & (1ULL << (i * 2))) >> i; y |= (value & (1ULL << ((i * 2) + 1))) >> (i + 1); } return Point{ static_cast<int32_t>(x), static_cast<int32_t>(y) }; } uint64_t fast_interleave(const Point &point) { uint64_t x = static_cast<uint32_t>(point.x); x = (x | (x << 16)) & 0x0000FFFF0000FFFFULL; x = (x | (x << 8)) & 0x00FF00FF00FF00FFULL; x = (x | (x << 4)) & 0x0F0F0F0F0F0F0F0FULL; x = (x | (x << 2)) & 0x3333333333333333ULL; x = (x | (x << 1)) & 0x5555555555555555ULL; uint64_t y = static_cast<uint32_t>(point.y); y = (y | (y << 16)) & 0x0000FFFF0000FFFFULL; y = (y | (y << 8)) & 0x00FF00FF00FF00FFULL; y = (y | (y << 4)) & 0x0F0F0F0F0F0F0F0FULL; y = (y | (y << 2)) & 0x3333333333333333ULL; y = (y | (y << 1)) & 0x5555555555555555ULL; return x | (y << 1); } Point fast_deinterleave(uint64_t value) { uint64_t x = value & 0x5555555555555555ULL; x = (x | (x >> 1)) & 0x3333333333333333ULL; x = (x | (x >> 2)) & 0x0F0F0F0F0F0F0F0FULL; x = (x | (x >> 4)) & 0x00FF00FF00FF00FFULL; x = (x | (x >> 8)) & 0x0000FFFF0000FFFFULL; x = (x | (x >> 16)) & 0x00000000FFFFFFFFULL; uint64_t y = (value >> 1) & 0x5555555555555555ULL; y = (y | (y >> 1)) & 0x3333333333333333ULL; y = (y | (y >> 2)) & 0x0F0F0F0F0F0F0F0FULL; y = (y | (y >> 4)) & 0x00FF00FF00FF00FFULL; y = (y | (y >> 8)) & 0x0000FFFF0000FFFFULL; y = (y | (y >> 16)) & 0x00000000FFFFFFFFULL; return Point{ static_cast<int32_t>(x), static_cast<int32_t>(y) }; } } // namespace int main() { constexpr size_t NUM_POINTS = 1 << 14; constexpr size_t LOOP_COUNT = 1 << 10; constexpr size_t TOTAL_COUNT = NUM_POINTS * LOOP_COUNT; Stopwatch stopwatch; int64_t elapsed; cout << "Generating points: " << flush; stopwatch.reset(); mt19937 rng; vector<Point> points(NUM_POINTS); for (size_t i = 0; i < NUM_POINTS; ++i) { do { points[i].x = rng(); points[i].y = rng(); } while ((points[i].x == 0) && (points[i].y == 0)); } cout << stopwatch.elapsed() << " [ns]" << endl; cout << "Generating values: " << flush; stopwatch.reset(); vector<uint64_t> values(NUM_POINTS); for (size_t i = 0; i < NUM_POINTS; ++i) { values[i] = naive_interleave(points[i]); assert(values[i] != 0); } cout << stopwatch.elapsed() << " [ns]" << endl; cout << "Testing fast implementations: " << flush; stopwatch.reset(); for (size_t i = 0; i < NUM_POINTS; ++i) { const uint64_t value = fast_interleave(points[i]); assert(value == values[i]); const Point point = fast_deinterleave(value); assert(point.x == points[i].x); assert(point.y == points[i].y); } cout << stopwatch.elapsed() << " [ns]" << endl; cout << "naive_interleave: " << flush; stopwatch.reset(); for (size_t i = 0; i < LOOP_COUNT; ++i) { for (size_t j = 0; j < NUM_POINTS; ++j) { const uint64_t value = naive_interleave(points[j]); assert(value == values[j]); } } elapsed = stopwatch.elapsed(); cout << elapsed << " [ns] -> average " << (elapsed / TOTAL_COUNT) << " [ns]" << endl; cout << "fast_interleave: "; stopwatch.reset(); for (size_t i = 0; i < LOOP_COUNT; ++i) { for (size_t j = 0; j < NUM_POINTS; ++j) { const uint64_t value = fast_interleave(points[j]); assert(value == values[j]); } } elapsed = stopwatch.elapsed(); cout << elapsed << " [ns] -> average " << (elapsed / TOTAL_COUNT) << " [ns]" << endl; cout << "naive_interleave: " << flush; stopwatch.reset(); for (size_t i = 0; i < LOOP_COUNT; ++i) { for (size_t j = 0; j < NUM_POINTS; ++j) { const Point point = naive_deinterleave(values[j]); assert(point.x == points[j].x); assert(point.y == points[j].y); } } elapsed = stopwatch.elapsed(); cout << elapsed << " [ns] -> average " << (elapsed / TOTAL_COUNT) << " [ns]" << endl; cout << "fast_interleave: " << flush; stopwatch.reset(); for (size_t i = 0; i < LOOP_COUNT; ++i) { for (size_t j = 0; j < NUM_POINTS; ++j) { const Point point = fast_deinterleave(values[j]); assert(point.x == points[j].x); assert(point.y == points[j].y); } } elapsed = stopwatch.elapsed(); cout << elapsed << " [ns] -> average " << (elapsed / TOTAL_COUNT) << " [ns]" << endl; cout << "naive_interleave/deinterleave: " << flush; stopwatch.reset(); for (size_t i = 0; i < LOOP_COUNT; ++i) { for (size_t j = 0; j < NUM_POINTS; ++j) { const uint64_t value = naive_interleave(points[j]); const Point point = naive_deinterleave(value); assert(point.x == points[j].x); assert(point.y == points[j].y); } } elapsed = stopwatch.elapsed(); cout << elapsed << " [ns] -> average " << (elapsed / TOTAL_COUNT) << " [ns]" << endl; cout << "fast_interleave/deinterleave: " << flush; stopwatch.reset(); for (size_t i = 0; i < LOOP_COUNT; ++i) { for (size_t j = 0; j < NUM_POINTS; ++j) { const uint64_t value = fast_interleave(points[j]); const Point point = fast_deinterleave(value); assert(point.x == points[j].x); assert(point.y == points[j].y); } } elapsed = stopwatch.elapsed(); cout << elapsed << " [ns] -> average " << (elapsed / TOTAL_COUNT) << " [ns]" << endl; return 0; }