Skip to content

Commit

Permalink
Encoding shuffle: implement faster algorithm
Browse files Browse the repository at this point in the history
Many thanks to Wojciech Muła for the core algorithm:
  http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html
  • Loading branch information
aklomp committed Apr 24, 2016
1 parent 4489686 commit 35109ad
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 60 deletions.
67 changes: 34 additions & 33 deletions lib/codec_avx2.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,39 +35,40 @@ enc_reshuffle (__m256i in)
0, 1, 2, -1,
3, 4, 5, -1));

// For each 32-bit word, reorder to bigendian, duplicating the third
// byte in every block of four:
in = _mm256_shuffle_epi8(in, _mm256_setr_epi8(
2, 2, 1, 0,
5, 5, 4, 3,
8, 8, 7, 6,
11, 11, 10, 9,
2, 2, 1, 0,
5, 5, 4, 3,
8, 8, 7, 6,
11, 11, 10, 9));

// Mask to pass through only the lower 6 bits of one byte:
__m256i mask = _mm256_set1_epi32(0x3F000000);

// Shift bits by 2, mask in only the first byte:
__m256i out = _mm256_and_si256(_mm256_srli_epi32(in, 2), mask);
mask = _mm256_srli_epi32(mask, 8);

// Shift bits by 4, mask in only the second byte:
out = _mm256_or_si256(out, _mm256_and_si256(_mm256_srli_epi32(in, 4), mask));
mask = _mm256_srli_epi32(mask, 8);

// Shift bits by 6, mask in only the third byte:
out = _mm256_or_si256(out, _mm256_and_si256(_mm256_srli_epi32(in, 6), mask));
mask = _mm256_srli_epi32(mask, 8);

// No shift necessary for the fourth byte because we duplicated the
// third byte to this position; just mask:
out = _mm256_or_si256(out, _mm256_and_si256(in, mask));

// Reorder to 32-bit little-endian:
return _mm256_bswap_epi32(out);
// Slice into 32-bit chunks and operate on all chunks in parallel.
// All processing is done within the 32-bit chunk. First, shuffle:
// before: [eeeeeeff|ccdddddd|bbbbcccc|aaaaaabb]
// after: [00000000|aaaaaabb|bbbbcccc|ccdddddd]
in = _mm256_shuffle_epi8(in, _mm256_set_epi8(
-1, 9, 10, 11,
-1, 6, 7, 8,
-1, 3, 4, 5,
-1, 0, 1, 2,
-1, 9, 10, 11,
-1, 6, 7, 8,
-1, 3, 4, 5,
-1, 0, 1, 2));

// cd = [00000000|00000000|0000cccc|ccdddddd]
const __m256i cd = _mm256_and_si256(in, _mm256_set1_epi32(0x00000FFF));

// ab = [0000aaaa|aabbbbbb|00000000|00000000]
const __m256i ab = _mm256_and_si256(_mm256_slli_epi32(in, 4), _mm256_set1_epi32(0x0FFF0000));

// merged = [0000aaaa|aabbbbbb|0000cccc|ccdddddd]
const __m256i merged = _mm256_or_si256(ab, cd);

// bd = [00000000|00bbbbbb|00000000|00dddddd]
const __m256i bd = _mm256_and_si256(merged, _mm256_set1_epi32(0x003F003F));

// ac = [00aaaaaa|00000000|00cccccc|00000000]
const __m256i ac = _mm256_and_si256(_mm256_slli_epi32(merged, 2), _mm256_set1_epi32(0x3F003F00));

// indices = [00aaaaaa|00bbbbbb|00cccccc|00dddddd]
const __m256i indices = _mm256_or_si256(ac, bd);

// return = [00dddddd|00cccccc|00bbbbbb|00aaaaaa]
return _mm256_bswap_epi32(indices);
}

static inline __m256i
Expand Down
52 changes: 25 additions & 27 deletions lib/codec_ssse3.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,38 +25,36 @@ _mm_bswap_epi32 (const __m128i in)
static inline __m128i
enc_reshuffle (__m128i in)
{
// Reorder to 32-bit big-endian, duplicating the third byte in every
// block of four. This copies the third byte to its final destination,
// so we can include it later by just masking instead of shifting and
// masking. The workset must be in big-endian, otherwise the shifted
// bits do not carry over properly among adjacent bytes:
in = _mm_shuffle_epi8(in, _mm_setr_epi8(
2, 2, 1, 0,
5, 5, 4, 3,
8, 8, 7, 6,
11, 11, 10, 9));

// Mask to pass through only the lower 6 bits of one byte:
__m128i mask = _mm_set1_epi32(0x3F000000);
// Slice into 32-bit chunks and operate on all chunks in parallel.
// All processing is done within the 32-bit chunk. First, shuffle:
// before: [eeeeeeff|ccdddddd|bbbbcccc|aaaaaabb]
// after: [00000000|aaaaaabb|bbbbcccc|ccdddddd]
in = _mm_shuffle_epi8(in, _mm_set_epi8(
-1, 9, 10, 11,
-1, 6, 7, 8,
-1, 3, 4, 5,
-1, 0, 1, 2));

// Shift bits by 2, mask in only the first byte:
__m128i out = _mm_and_si128(_mm_srli_epi32(in, 2), mask);
mask = _mm_srli_epi32(mask, 8);
// cd = [00000000|00000000|0000cccc|ccdddddd]
const __m128i cd = _mm_and_si128(in, _mm_set1_epi32(0x00000FFF));

// Shift bits by 4, mask in only the second byte:
out = _mm_or_si128(out, _mm_and_si128(_mm_srli_epi32(in, 4), mask));
mask = _mm_srli_epi32(mask, 8);
// ab = [0000aaaa|aabbbbbb|00000000|00000000]
const __m128i ab = _mm_and_si128(_mm_slli_epi32(in, 4), _mm_set1_epi32(0x0FFF0000));

// Shift bits by 6, mask in only the third byte:
out = _mm_or_si128(out, _mm_and_si128(_mm_srli_epi32(in, 6), mask));
mask = _mm_srli_epi32(mask, 8);
// merged = [0000aaaa|aabbbbbb|0000cccc|ccdddddd]
const __m128i merged = _mm_or_si128(ab, cd);

// bd = [00000000|00bbbbbb|00000000|00dddddd]
const __m128i bd = _mm_and_si128(merged, _mm_set1_epi32(0x003F003F));

// ac = [00aaaaaa|00000000|00cccccc|00000000]
const __m128i ac = _mm_and_si128(_mm_slli_epi32(merged, 2), _mm_set1_epi32(0x3F003F00));

// No shift necessary for the fourth byte because we duplicated
// the third byte to this position; just mask:
out = _mm_or_si128(out, _mm_and_si128(in, mask));
// indices = [00aaaaaa|00bbbbbb|00cccccc|00dddddd]
const __m128i indices = _mm_or_si128(ac, bd);

// Reorder to 32-bit little-endian and return:
return _mm_bswap_epi32(out);
// return = [00dddddd|00cccccc|00bbbbbb|00aaaaaa]
return _mm_bswap_epi32(indices);
}

static inline __m128i
Expand Down

0 comments on commit 35109ad

Please sign in to comment.