jc01rho의 일상잡다

비트열 뒤집기 - Bir reversal - MSB <-> LSB 스왑. 최적방법 본문

컴퓨터/C & C++

비트열 뒤집기 - Bir reversal - MSB <-> LSB 스왑. 최적방법

jc01rho 2012. 7. 8. 03:16

http://stackoverflow.com/questions/746171/best-algorithm-for-bit-reversal-from-msb-lsb-to-lsb-msb-in-c

 

What is the best algorithm to achieve the following:

0010 0000 => 0000 0100

The conversion is from MSB->LSB to LSB->MSB. All bits must be reversed; that is, this is not endianness-swapping.

 

 

 

NOTE: All algorithms below are in C, but should be portable to your language of choice (just don't look at me when they're not as fast :)

Options

Low Memory (32-bit int, 32-bit machine)(from here):

unsigned int 
reverse(register unsigned int x) 
{ 
    x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1)); 
    x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2)); 
    x = (((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4)); 
    x = (((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8)); 
    return((x >> 16) | (x << 16)); 
 
} 

From the famous Bit Twiddling Hacks page:

Fastest (lookup table):

static const unsigned char BitReverseTable256[] =  
{ 
  0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0,  
  0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8,  
  0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4,  
  0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC,  
  0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2,  
  0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 
  0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,  
  0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, 
  0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1, 
  0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9,  
  0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 
  0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 
  0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,  
  0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB, 
  0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7,  
  0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF 
}; 
 
unsigned int v; // reverse 32-bit value, 8 bits at time 
unsigned int c; // c will get v reversed 
 
// Option 1: 
c = (BitReverseTable256[v & 0xff] << 24) |  
    (BitReverseTable256[(v >> 8) & 0xff] << 16) |  
    (BitReverseTable256[(v >> 16) & 0xff] << 8) | 
    (BitReverseTable256[(v >> 24) & 0xff]); 
 
// Option 2: 
unsigned char * p = (unsigned char *) &v; 
unsigned char * q = (unsigned char *) &c; 
q[3] = BitReverseTable256[p[0]];  
q[2] = BitReverseTable256[p[1]];  
q[1] = BitReverseTable256[p[2]];  
q[0] = BitReverseTable256[p[3]]; 

You can extend this idea to 64-bit ints, or trade off memory for speed (assuming your L1 Data Cache is large enough), and reverse 16-bits at a time with a 64K-entry lookup table.


Others

Simple

unsigned int v;     // input bits to be reversed 
unsigned int r = v; // r will be reversed bits of v; first get LSB of v 
int s = sizeof(v) * CHAR_BIT - 1; // extra shift needed at end 
 
for (v >>= 1; v; v >>= 1) 
{    
  r <<= 1; 
  r |= v & 1; 
  s--; 
} 
r <<= s; // shift when v's highest bits are zero 

Faster (32-bit processor)

unsigned char b = x; 
b = ((b * 0x0802LU & 0x22110LU) | (b * 0x8020LU & 0x88440LU)) * 0x10101LU >> 16;  

Faster (64-bit processor)

unsigned char b; // reverse this (8-bit) byte 
b = (b * 0x0202020202ULL & 0x010884422010ULL) % 1023; 

If you want to do this on a 32-bit int, just reverse the bits in each bytes, and reverse the order of the bytes. That is:

unsigned int toReverse; 
unsigned int reversed; 
unsigned char inByte0 = (toReverse & 0xFF); 
unsigned char inByte1 = (toReverse & 0xFF00) >> 8; 
unsigned char inByte2 = (toReverse & 0xFF0000) >> 16; 
unsigned char inByte3 = (toReverse & 0xFF000000) >> 24; 
reversed = (reverseBits(inByte0) << 24) | (reverseBits(inByte1) << 16) | (reverseBits(inByte2) << 8) | (reverseBits(inByte3); 

Results

I benchmarked the two most promising solutions, the lookup table, and bitwise-AND (the first one). The test machine is a laptop w/ 4GB of DDR2-800 and a Core 2 Duo T7500 @ 2.4GHz, 4MB L2 Cache; YMMV. I used gcc 4.3.2 on 64-bit Linux. OpenMP (and the GCC bindings) were used for high-resolution timers.

reverse.c

#include <stdlib.h> 
#include <stdio.h> 
#include <omp.h> 
 
unsigned int 
reverse(register unsigned int x) 
{ 
    x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1)); 
    x = (((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2)); 
    x = (((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4)); 
    x = (((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8)); 
    return((x >> 16) | (x << 16)); 
 
} 
 
int main() 
{ 
    unsigned int *ints = malloc(100000000*sizeof(unsigned int)); 
    unsigned int *ints2 = malloc(100000000*sizeof(unsigned int)); 
    for(unsigned int i = 0; i < 100000000; i++) 
      ints[i] = rand(); 
 
    unsigned int *inptr = ints; 
    unsigned int *outptr = ints2; 
    unsigned int *endptr = ints + 100000000; 
    // Starting the time measurement 
    double start = omp_get_wtime(); 
    // Computations to be measured 
    while(inptr != endptr) 
    { 
      (*outptr) = reverse(*inptr); 
      inptr++; 
      outptr++; 
    } 
    // Measuring the elapsed time 
    double end = omp_get_wtime(); 
    // Time calculation (in seconds) 
    printf("Time: %f seconds\n", end-start); 
 
    free(ints); 
    free(ints2); 
 
    return 0; 
} 

reverse_lookup.c

#include <stdlib.h> 
#include <stdio.h> 
#include <omp.h> 
 
static const unsigned char BitReverseTable256[] =  
{ 
  0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0,  
  0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8,  
  0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4,  
  0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC,  
  0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2,  
  0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 
  0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,  
  0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, 
  0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1, 
  0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9,  
  0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 
  0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 
  0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,  
  0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB, 
  0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7,  
  0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF 
}; 
 
int main() 
{ 
    unsigned int *ints = malloc(100000000*sizeof(unsigned int)); 
    unsigned int *ints2 = malloc(100000000*sizeof(unsigned int)); 
    for(unsigned int i = 0; i < 100000000; i++) 
      ints[i] = rand(); 
 
    unsigned int *inptr = ints; 
    unsigned int *outptr = ints2; 
    unsigned int *endptr = ints + 100000000; 
    // Starting the time measurement 
    double start = omp_get_wtime(); 
    // Computations to be measured 
    while(inptr != endptr) 
    { 
    unsigned int in = *inptr;   
 
    // Option 1: 
    //*outptr = (BitReverseTable256[in & 0xff] << 24) |  
    //    (BitReverseTable256[(in >> 8) & 0xff] << 16) |  
    //    (BitReverseTable256[(in >> 16) & 0xff] << 8) | 
    //    (BitReverseTable256[(in >> 24) & 0xff]); 
 
    // Option 2: 
    unsigned char * p = (unsigned char *) &(*inptr); 
    unsigned char * q = (unsigned char *) &(*outptr); 
    q[3] = BitReverseTable256[p[0]];  
    q[2] = BitReverseTable256[p[1]];  
    q[1] = BitReverseTable256[p[2]];  
    q[0] = BitReverseTable256[p[3]]; 
 
      inptr++; 
      outptr++; 
    } 
    // Measuring the elapsed time 
    double end = omp_get_wtime(); 
    // Time calculation (in seconds) 
    printf("Time: %f seconds\n", end-start); 
 
    free(ints); 
    free(ints2); 
 
    return 0; 
} 

I tried both approaches at several different optimizations, ran 3 trials at each level, and each trial reversed 100 million random unsigned ints. For the lookup table option, I tried both schemes (options 1 and 2) given on the bitwise hacks page. Results are shown below.

Bitwise AND

mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -o reverse reverse.c 
mrj10@mjlap:~/code$ ./reverse 
Time: 2.000593 seconds 
mrj10@mjlap:~/code$ ./reverse 
Time: 1.938893 seconds 
mrj10@mjlap:~/code$ ./reverse 
Time: 1.936365 seconds 
mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O2 -o reverse reverse.c 
mrj10@mjlap:~/code$ ./reverse 
Time: 0.942709 seconds 
mrj10@mjlap:~/code$ ./reverse 
Time: 0.991104 seconds 
mrj10@mjlap:~/code$ ./reverse 
Time: 0.947203 seconds 
mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O3 -o reverse reverse.c 
mrj10@mjlap:~/code$ ./reverse 
Time: 0.922639 seconds 
mrj10@mjlap:~/code$ ./reverse 
Time: 0.892372 seconds 
mrj10@mjlap:~/code$ ./reverse 
Time: 0.891688 seconds 

Lookup Table (option 1)

mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -o reverse_lookup reverse_lookup.c 
mrj10@mjlap:~/code$ ./reverse_lookup 
Time: 1.201127 seconds               
mrj10@mjlap:~/code$ ./reverse_lookup 
Time: 1.196129 seconds               
mrj10@mjlap:~/code$ ./reverse_lookup 
Time: 1.235972 seconds               
mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O2 -o reverse_lookup reverse_lookup.c 
mrj10@mjlap:~/code$ ./reverse_lookup 
Time: 0.633042 seconds               
mrj10@mjlap:~/code$ ./reverse_lookup 
Time: 0.655880 seconds               
mrj10@mjlap:~/code$ ./reverse_lookup 
Time: 0.633390 seconds               
mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O3 -o reverse_lookup reverse_lookup.c 
mrj10@mjlap:~/code$ ./reverse_lookup 
Time: 0.652322 seconds               
mrj10@mjlap:~/code$ ./reverse_lookup 
Time: 0.631739 seconds               
mrj10@mjlap:~/code$ ./reverse_lookup 
Time: 0.652431 seconds   

Lookup Table (option 2)

mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -o reverse_lookup reverse_lookup.c 
mrj10@mjlap:~/code$ ./reverse_lookup 
Time: 1.671537 seconds 
mrj10@mjlap:~/code$ ./reverse_lookup 
Time: 1.688173 seconds 
mrj10@mjlap:~/code$ ./reverse_lookup 
Time: 1.664662 seconds 
mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O2 -o reverse_lookup reverse_lookup.c 
mrj10@mjlap:~/code$ ./reverse_lookup 
Time: 1.049851 seconds 
mrj10@mjlap:~/code$ ./reverse_lookup 
Time: 1.048403 seconds 
mrj10@mjlap:~/code$ ./reverse_lookup 
Time: 1.085086 seconds 
mrj10@mjlap:~/code$ gcc -fopenmp -std=c99 -O3 -o reverse_lookup reverse_lookup.c 
mrj10@mjlap:~/code$ ./reverse_lookup 
Time: 1.082223 seconds 
mrj10@mjlap:~/code$ ./reverse_lookup 
Time: 1.053431 seconds 
mrj10@mjlap:~/code$ ./reverse_lookup 
Time: 1.081224 seconds 

Conclusion

Use the lookup table, with option 1 (byte addressing is unsurprisingly slow) if you're concerned about performance. If you need to squeeze every last byte of memory out of your system (and you might, if you care about the performance of bit reversal), the optimized versions of the bitwise-AND approach aren't too shabby either.

Caveat

Yes, I know the benchmark code is a complete hack. Suggestions on how to improve it are more than welcome. Things I know about:

  • I don't have access to ICC. This may be faster (please respond in a comment if you can test this out).
  • A 64K lookup table may do well on some modern microarchitectures with large L1D.
  • -mtune=native didn't work for -O2/-O3 (ld blew up with some crazy symbol redefinition error), so I don't believe the generated code is tuned for my microarchitecture.
  • There may be a way to do this slightly faster with SSE. I have no idea how, but with fast replication, packed bitwise AND, and swizzling instructions, there's got to be something there.
  • I know only enough x86 assembly to be dangerous; here's the code GCC generated on -O3 for option 1, so somebody more knowledgable than myself can check it out:

32-bit

.L3: 
movl    (%r12,%rsi), %ecx 
movzbl  %cl, %eax 
movzbl  BitReverseTable256(%rax), %edx 
movl    %ecx, %eax 
shrl    $24, %eax 
mov     %eax, %eax 
movzbl  BitReverseTable256(%rax), %eax 
sall    $24, %edx 
orl     %eax, %edx 
movzbl  %ch, %eax 
shrl    $16, %ecx 
movzbl  BitReverseTable256(%rax), %eax 
movzbl  %cl, %ecx 
sall    $16, %eax 
orl     %eax, %edx 
movzbl  BitReverseTable256(%rcx), %eax 
sall    $8, %eax 
orl     %eax, %edx 
movl    %edx, (%r13,%rsi) 
addq    $4, %rsi 
cmpq    $400000000, %rsi 
jne     .L3 

EDIT: I also tried using uint64_t's on my machine to see if there was any performance boost. Performance was about 10% faster than 32-bit, and was nearly identical whether you were just using 64-bit types to reverse bits on two 32-bit ints at a time, or whether you were actually reversing bits in half as many 64-bit values. The assembly code is shown below (for the former case, reversing bits for 2 32-bit ints at a time):

.L3: 
movq    (%r12,%rsi), %rdx 
movq    %rdx, %rax 
shrq    $24, %rax 
andl    $255, %eax 
movzbl  BitReverseTable256(%rax), %ecx 
movzbq  %dl,%rax 
movzbl  BitReverseTable256(%rax), %eax 
salq    $24, %rax 
orq     %rax, %rcx 
movq    %rdx, %rax 
shrq    $56, %rax 
movzbl  BitReverseTable256(%rax), %eax 
salq    $32, %rax 
orq     %rax, %rcx 
movzbl  %dh, %eax 
shrq    $16, %rdx 
movzbl  BitReverseTable256(%rax), %eax 
salq    $16, %rax 
orq     %rax, %rcx 
movzbq  %dl,%rax 
shrq    $16, %rdx 
movzbl  BitReverseTable256(%rax), %eax 
salq    $8, %rax 
orq     %rax, %rcx 
movzbq  %dl,%rax 
shrq    $8, %rdx 
movzbl  BitReverseTable256(%rax), %eax 
salq    $56, %rax 
orq     %rax, %rcx 
movzbq  %dl,%rax 
shrq    $8, %rdx 
movzbl  BitReverseTable256(%rax), %eax 
andl    $255, %edx 
salq    $48, %rax 
orq     %rax, %rcx 
movzbl  BitReverseTable256(%rdx), %eax 
salq    $40, %rax 
orq     %rax, %rcx 
movq    %rcx, (%r13,%rsi) 
addq    $8, %rsi 
cmpq    $400000000, %rsi 
jne     .L3 

link|improve this answer

 

 

 

 

 

Comments