Tricks for Bitwise Interleaving and Deinterleaving

概要

ビット単位の Interleave を効率化する手法を見つけたので,単純な実装と比べてみました.

Deinterleave については,Interleave の演算を概ね逆向きに適用することで実現できます.

ちなみに,ビット単位の Interleave は groonga の位置情報検索に使われています.

比較の結果をまとめると,Interleave/Deinterleave にかかる時間を 1/10 にできました.また,groonga 付属のベンチマークによると,位置情報検索にかかる時間は 9/10 くらいになるようです.

環境

$ clang -v
clang version 3.2 (tags/RELEASE_32/final)
Target: x86_64-unknown-linux-gnu
Thread model: posix

ビルド方法

$ clang++ -std=c++11 -Wall -O2 benchmark.cpp

実行結果

$ ./a.out 
Generating points: 1063000 [ns]
Generating values: 4919000 [ns]
Testing fast implementations: 925000 [ns]
naive_interleave: 2169113000 [ns] -> average 129 [ns]
fast_interleave: 195374000 [ns] -> average 11 [ns]
naive_interleave: 2036491000 [ns] -> average 121 [ns]
fast_interleave: 244196000 [ns] -> average 14 [ns]
naive_interleave/deinterleave: 4202892000 [ns] -> average 250 [ns]
fast_interleave/deinterleave: 482340000 [ns] -> average 28 [ns]

ソースコード

#include <cassert>
#include <chrono>
#include <cstdint>
#include <iostream>
#include <random>

using namespace std;

namespace {

class Stopwatch {
 public:
  Stopwatch() : base_(now()) {}

  void reset() {
    base_ = now();
  }

  int64_t elapsed() const {
    return now() - base_;
  }

 private:
  int64_t base_;

  static int64_t now() {
    return chrono::duration_cast<chrono::nanoseconds>(
               chrono::system_clock::now() -
               chrono::system_clock::from_time_t(0)).count();
  }
};

struct Point {
  int32_t x;
  int32_t y;
};

uint64_t naive_interleave(const Point &point) {
  const uint64_t x = static_cast<uint32_t>(point.x);
  const uint64_t y = static_cast<uint32_t>(point.y);
  uint64_t value = 0;
  for (int i = 0; i < 32; ++i) {
    value |= (x & (1ULL << i)) << i;
    value |= (y & (1ULL << i)) << (i + 1);
  }
  return value;
}

Point naive_deinterleave(uint64_t value) {
  uint64_t x = 0;
  uint64_t y = 0;
  for (int i = 0; i < 32; ++i) {
    x |= (value & (1ULL << (i * 2))) >> i;
    y |= (value & (1ULL << ((i * 2) + 1))) >> (i + 1);
  }
  return Point{ static_cast<int32_t>(x), static_cast<int32_t>(y) };
}

uint64_t fast_interleave(const Point &point) {
  uint64_t x = static_cast<uint32_t>(point.x);
  x = (x | (x << 16)) & 0x0000FFFF0000FFFFULL;
  x = (x | (x <<  8)) & 0x00FF00FF00FF00FFULL;
  x = (x | (x <<  4)) & 0x0F0F0F0F0F0F0F0FULL;
  x = (x | (x <<  2)) & 0x3333333333333333ULL;
  x = (x | (x <<  1)) & 0x5555555555555555ULL;
  uint64_t y = static_cast<uint32_t>(point.y);
  y = (y | (y << 16)) & 0x0000FFFF0000FFFFULL;
  y = (y | (y <<  8)) & 0x00FF00FF00FF00FFULL;
  y = (y | (y <<  4)) & 0x0F0F0F0F0F0F0F0FULL;
  y = (y | (y <<  2)) & 0x3333333333333333ULL;
  y = (y | (y <<  1)) & 0x5555555555555555ULL;
  return x | (y << 1);
}

Point fast_deinterleave(uint64_t value) {
  uint64_t x = value & 0x5555555555555555ULL;
  x = (x | (x >>  1)) & 0x3333333333333333ULL;
  x = (x | (x >>  2)) & 0x0F0F0F0F0F0F0F0FULL;
  x = (x | (x >>  4)) & 0x00FF00FF00FF00FFULL;
  x = (x | (x >>  8)) & 0x0000FFFF0000FFFFULL;
  x = (x | (x >> 16)) & 0x00000000FFFFFFFFULL;
  uint64_t y = (value >> 1) & 0x5555555555555555ULL;
  y = (y | (y >>  1)) & 0x3333333333333333ULL;
  y = (y | (y >>  2)) & 0x0F0F0F0F0F0F0F0FULL;
  y = (y | (y >>  4)) & 0x00FF00FF00FF00FFULL;
  y = (y | (y >>  8)) & 0x0000FFFF0000FFFFULL;
  y = (y | (y >> 16)) & 0x00000000FFFFFFFFULL;
  return Point{ static_cast<int32_t>(x), static_cast<int32_t>(y) };
}

}  // namespace

int main() {
  constexpr size_t NUM_POINTS = 1 << 14;
  constexpr size_t LOOP_COUNT = 1 << 10;
  constexpr size_t TOTAL_COUNT = NUM_POINTS * LOOP_COUNT;

  Stopwatch stopwatch;
  int64_t elapsed;

  cout << "Generating points: " << flush;
  stopwatch.reset();
  mt19937 rng;
  vector<Point> points(NUM_POINTS);
  for (size_t i = 0; i < NUM_POINTS; ++i) {
    do {
      points[i].x = rng();
      points[i].y = rng();
    } while ((points[i].x == 0) && (points[i].y == 0));
  }
  cout << stopwatch.elapsed() << " [ns]" << endl;

  cout << "Generating values: " << flush;
  stopwatch.reset();
  vector<uint64_t> values(NUM_POINTS);
  for (size_t i = 0; i < NUM_POINTS; ++i) {
    values[i] = naive_interleave(points[i]);
    assert(values[i] != 0);
  }
  cout << stopwatch.elapsed() << " [ns]" << endl;

  cout << "Testing fast implementations: " << flush;
  stopwatch.reset();
  for (size_t i = 0; i < NUM_POINTS; ++i) {
    const uint64_t value = fast_interleave(points[i]);
    assert(value == values[i]);
    const Point point = fast_deinterleave(value);
    assert(point.x == points[i].x);
    assert(point.y == points[i].y);
  }
  cout << stopwatch.elapsed() << " [ns]" << endl;

  cout << "naive_interleave: " << flush;
  stopwatch.reset();
  for (size_t i = 0; i < LOOP_COUNT; ++i) {
    for (size_t j = 0; j < NUM_POINTS; ++j) {
      const uint64_t value = naive_interleave(points[j]);
      assert(value == values[j]);
    }
  }
  elapsed = stopwatch.elapsed();
  cout << elapsed << " [ns] -> average "
            << (elapsed / TOTAL_COUNT) << " [ns]" << endl;

  cout << "fast_interleave: ";
  stopwatch.reset();
  for (size_t i = 0; i < LOOP_COUNT; ++i) {
    for (size_t j = 0; j < NUM_POINTS; ++j) {
      const uint64_t value = fast_interleave(points[j]);
      assert(value == values[j]);
    }
  }
  elapsed = stopwatch.elapsed();
  cout << elapsed << " [ns] -> average "
            << (elapsed / TOTAL_COUNT) << " [ns]" << endl;

  cout << "naive_interleave: " << flush;
  stopwatch.reset();
  for (size_t i = 0; i < LOOP_COUNT; ++i) {
    for (size_t j = 0; j < NUM_POINTS; ++j) {
      const Point point = naive_deinterleave(values[j]);
      assert(point.x == points[j].x);
      assert(point.y == points[j].y);
    }
  }
  elapsed = stopwatch.elapsed();
  cout << elapsed << " [ns] -> average "
            << (elapsed / TOTAL_COUNT) << " [ns]" << endl;

  cout << "fast_interleave: " << flush;
  stopwatch.reset();
  for (size_t i = 0; i < LOOP_COUNT; ++i) {
    for (size_t j = 0; j < NUM_POINTS; ++j) {
      const Point point = fast_deinterleave(values[j]);
      assert(point.x == points[j].x);
      assert(point.y == points[j].y);
    }
  }
  elapsed = stopwatch.elapsed();
  cout << elapsed << " [ns] -> average "
            << (elapsed / TOTAL_COUNT) << " [ns]" << endl;

  cout << "naive_interleave/deinterleave: " << flush;
  stopwatch.reset();
  for (size_t i = 0; i < LOOP_COUNT; ++i) {
    for (size_t j = 0; j < NUM_POINTS; ++j) {
      const uint64_t value = naive_interleave(points[j]);
      const Point point = naive_deinterleave(value);
      assert(point.x == points[j].x);
      assert(point.y == points[j].y);
    }
  }
  elapsed = stopwatch.elapsed();
  cout << elapsed << " [ns] -> average "
            << (elapsed / TOTAL_COUNT) << " [ns]" << endl;

  cout << "fast_interleave/deinterleave: " << flush;
  stopwatch.reset();
  for (size_t i = 0; i < LOOP_COUNT; ++i) {
    for (size_t j = 0; j < NUM_POINTS; ++j) {
      const uint64_t value = fast_interleave(points[j]);
      const Point point = fast_deinterleave(value);
      assert(point.x == points[j].x);
      assert(point.y == points[j].y);
    }
  }
  elapsed = stopwatch.elapsed();
  cout << elapsed << " [ns] -> average "
            << (elapsed / TOTAL_COUNT) << " [ns]" << endl;

  return 0;
}