I joined in a "competition" with some of the experts in the assembly forums over at experts-exchange to write the fastest assembly code Base64 encoder, following the guidelines of the
RFC3548.
Several versions emerged, some using lookup tables to precompute results, which fared well on CPUs with large caches. My version was more about opcode optimization and using bitwise arithmetic instead of ordinary multiply/divide arithmetic, which achieved the best results on CPUs that have better pipelining.
Code:
// Encode len bytes from pSrc as Base64 (RFC 3548) into pszOutBuf.
// Output is 4*ceil(len/3) characters, '='-padded at the tail.
// NOTE(review): output is NOT NUL-terminated and the written length is not
// returned - the caller must size pszOutBuf for 4*ceil(len/3) bytes.
// MSVC 32-bit x86 inline assembly only (__asm).
void ToBase64( BYTE* pSrc, char* pszOutBuf, int len )
{
char* chr_table="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
__asm{
mov ecx, len //;ecx = source bytes remaining
mov esi, pSrc //;bytes from source
mov edi, chr_table //;edi = base of 64-char alphabet
push ebp //;ebp is repurposed below; stack locals unreachable until pop
mov ebp, pszOutBuf //;ebp = output write pointer
cmp ecx, 3
jl tail_bytes //;FIX: len < 3 (including len == 0) must go straight to the
//;tail handler; the do-while loop below would otherwise encode one
//;unmasked group, reading past pSrc and emitting 4 garbage chars
src_byteLoop:
xor eax, eax
//;read 3 bytes into eax bits 31..8, first source byte most significant
mov ah, byte ptr[esi]
mov al, byte ptr[esi+1]
shl eax, 16
mov ah, byte ptr[esi+2]
//;manipulate in edx bitset1: top 6 bits of eax -> table index
mov edx, eax
shl eax, 6 //;done first 6 bits
shr edx, 26 //;edx = 6-bit index (0..63)
mov bl, byte ptr [edi+edx] //;put char in buffer
mov byte ptr[ebp], bl
inc ebp //;next buf
//;manipulate in edx bitset2
mov edx, eax
shl eax, 6 //;done next 6 bits
shr edx, 26
mov bl, byte ptr [edi+edx] //;put char in buffer
mov byte ptr[ebp], bl
inc ebp //;next buf
//;manipulate in edx bitset3
mov edx, eax
shl eax, 6 //;done next 6 bits
shr edx, 26
mov bl, byte ptr [edi+edx] //;put char in buffer
mov byte ptr[ebp], bl
inc ebp //;next buf
//;manipulate in edx bitset4
mov edx, eax
shl eax, 6 //;all 24 bits consumed
shr edx, 26
mov bl, byte ptr [edi+edx] //;put char in buffer
mov byte ptr[ebp], bl
inc ebp //;next buf
//;done these bytes
add esi, 3
sub ecx, 3
cmp ecx, 3
jge src_byteLoop //;still got a full 3-byte group of src bytes
tail_bytes:
xor eax, eax //;set to zero (pad count)
cmp ecx, 0
jz finished //;len was an exact multiple of 3 - no padding needed
//;need to pad out some extra bytes (ecx = 1 or 2 remaining)
//;read in 3 bytes regardless of junk data following pSrc - already zero from above)
//;NOTE(review): may read up to 2 bytes past pSrc+len; junk is masked out below
mov ah, byte ptr[esi]
mov al, byte ptr[esi+1]
shl eax, 16
mov ah, byte ptr[esi+2]
sub ecx, 3 //;bytes just read
neg ecx //;+ve inverse = number of '=' pad chars (3 - remaining)
mov edx, ecx //;save how many bytes need padding
//;as per the RFC, any padded bytes should be 0s
mov esi, 0xFFFFFF //;esi no longer needed as source pointer
lea ecx, dword ptr[ecx*8+8] //;shift amount: pad=1 -> 16, pad=2 -> 24
shl esi, cl //;mask keeps only the valid source bytes in bits 31..8
and eax, esi //;mask out the junk bytes
mov ecx, edx //;restore pad count
//;manipulate in edx byte 1
mov edx, eax
shl eax, 6 //;done first 6 bits
shr edx, 26
mov bl, byte ptr [edi+edx] //;put char in buffer
mov byte ptr[ebp], bl
inc ebp //;next buf
//;manipulate in edx byte 2
mov edx, eax
shl eax, 6 //;done next 6 bits
shr edx, 26
mov bl, byte ptr [edi+edx] //;put char in buffer
mov byte ptr[ebp], bl
inc ebp //;next buf
//;manipulate in edx byte 3
mov edx, eax
shl eax, 6 //;done next 6 bits
shr edx, 26
mov bl, byte ptr [edi+edx] //;put char in buffer
mov byte ptr[ebp], bl
inc ebp //;next buf
//;manipulate in edx byte 4 (zeroed bits encode as 'A', overwritten by '=' below)
mov edx, eax
shl eax, 6 //;all bits consumed
shr edx, 26
mov bl, byte ptr [edi+edx] //;put char in buffer
mov byte ptr[ebp], bl
inc ebp //;next buf
mov eax, ecx //;'return' pad count
finished:
test eax, eax
jz end //;no padding required
//;some bytes were padding, put them as =
sub ebp, eax //;move ptr back for num bytes to pad
padChars:
mov byte ptr[ebp], 0x3d //;=
inc ebp
dec eax
jnz padChars
end:
pop ebp //;restore frame pointer
}
}
There were several key points to my optimization technique:
- Firstly, unrolling of loops: eliminating loop structures in the code in favour of writing out the operations on the data in a straight line. In terms of asm optimization this reduces branching, so EIP does not have to jump as often.
- Another key point was minimizing the load/store operations performed. To keep these down, you can use indirect addressing with a base and index register to get to the byte in memory you are interested in. For example:
mov edx, base
add edx, edi
mov bl, byte ptr[edx]
becomes
mov edx, base
mov bl, byte ptr[edx+edi]
- As Base64 encoding requires you to work at bit level using 3 bytes at a time, you can speed up the conversion of the bit sets by using the binary arithmetic operations SHL and SHR (shift left/right) to progressively take the most significant bits of one DWORD and operate on them at a lower significance by shifting them. For example, suppose EAX contains a DWORD and you copy it to EDX. You can discard the leftmost 6 bits of EAX by shifting left by 6, and isolate those same 6 bits in EDX by shifting EDX right by 26 bits. A single shift by a multi-bit count like this is much quicker than shifting out one bit at a time.
My final throughput on the official test machine was 196.92 MB/s, which is almost double the throughput of the CryptoAPI function, which came in at 104.05 MB/s