I joined in a "competition" with some of the experts in the assembly forums over at experts-exchange to write the fastest assembly code Base64 encoder, following the guidelines of the
RFC3548.
Several versions emerged, some using lookup tables to precompute results, which fared well on CPUs with large caches. My version was more about opcode optimization and using bitwise arithmetic instead of ordinary multiply/divide arithmetic, which achieved the best results on CPUs that have better pipelining.
Code:
// Encode len bytes from pSrc as Base64 (RFC 3548) into pszOutBuf.
// Output is 4*ceil(len/3) characters, '='-padded at the tail.
// NOTE(review): output is NOT NUL-terminated and the written length is not
// returned - the caller must size pszOutBuf for 4*ceil(len/3) bytes.
// MSVC 32-bit x86 inline assembly only (__asm).
void ToBase64( BYTE* pSrc, char* pszOutBuf, int len )
{
char* chr_table="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
__asm{
mov ecx, len //;ecx = source bytes remaining
mov esi, pSrc //;bytes from source
mov edi, chr_table //;edi = base of 64-char alphabet
push ebp //;ebp is repurposed below; stack locals unreachable until pop
mov ebp, pszOutBuf //;ebp = output write pointer
cmp ecx, 3
jl tail_bytes //;FIX: len < 3 (including len == 0) must go straight to the
//;tail handler; the do-while loop below would otherwise encode one
//;unmasked group, reading past pSrc and emitting 4 garbage chars
src_byteLoop:
xor eax, eax
//;read 3 bytes into eax bits 31..8, first source byte most significant
mov ah, byte ptr[esi]
mov al, byte ptr[esi+1]
shl eax, 16
mov ah, byte ptr[esi+2]
//;manipulate in edx bitset1: top 6 bits of eax -> table index
mov edx, eax
shl eax, 6 //;done first 6 bits
shr edx, 26 //;edx = 6-bit index (0..63)
mov bl, byte ptr [edi+edx] //;put char in buffer
mov byte ptr[ebp], bl
inc ebp //;next buf
//;manipulate in edx bitset2
mov edx, eax
shl eax, 6 //;done next 6 bits
shr edx, 26
mov bl, byte ptr [edi+edx] //;put char in buffer
mov byte ptr[ebp], bl
inc ebp //;next buf
//;manipulate in edx bitset3
mov edx, eax
shl eax, 6 //;done next 6 bits
shr edx, 26
mov bl, byte ptr [edi+edx] //;put char in buffer
mov byte ptr[ebp], bl
inc ebp //;next buf
//;manipulate in edx bitset4
mov edx, eax
shl eax, 6 //;all 24 bits consumed
shr edx, 26
mov bl, byte ptr [edi+edx] //;put char in buffer
mov byte ptr[ebp], bl
inc ebp //;next buf
//;done these bytes
add esi, 3
sub ecx, 3
cmp ecx, 3
jge src_byteLoop //;still got a full 3-byte group of src bytes
tail_bytes:
xor eax, eax //;set to zero (pad count)
cmp ecx, 0
jz finished //;len was an exact multiple of 3 - no padding needed
//;need to pad out some extra bytes (ecx = 1 or 2 remaining)
//;read in 3 bytes regardless of junk data following pSrc - already zero from above)
//;NOTE(review): may read up to 2 bytes past pSrc+len; junk is masked out below
mov ah, byte ptr[esi]
mov al, byte ptr[esi+1]
shl eax, 16
mov ah, byte ptr[esi+2]
sub ecx, 3 //;bytes just read
neg ecx //;+ve inverse = number of '=' pad chars (3 - remaining)
mov edx, ecx //;save how many bytes need padding
//;as per the RFC, any padded bytes should be 0s
mov esi, 0xFFFFFF //;esi no longer needed as source pointer
lea ecx, dword ptr[ecx*8+8] //;shift amount: pad=1 -> 16, pad=2 -> 24
shl esi, cl //;mask keeps only the valid source bytes in bits 31..8
and eax, esi //;mask out the junk bytes
mov ecx, edx //;restore pad count
//;manipulate in edx byte 1
mov edx, eax
shl eax, 6 //;done first 6 bits
shr edx, 26
mov bl, byte ptr [edi+edx] //;put char in buffer
mov byte ptr[ebp], bl
inc ebp //;next buf
//;manipulate in edx byte 2
mov edx, eax
shl eax, 6 //;done next 6 bits
shr edx, 26
mov bl, byte ptr [edi+edx] //;put char in buffer
mov byte ptr[ebp], bl
inc ebp //;next buf
//;manipulate in edx byte 3
mov edx, eax
shl eax, 6 //;done next 6 bits
shr edx, 26
mov bl, byte ptr [edi+edx] //;put char in buffer
mov byte ptr[ebp], bl
inc ebp //;next buf
//;manipulate in edx byte 4 (zeroed bits encode as 'A', overwritten by '=' below)
mov edx, eax
shl eax, 6 //;all bits consumed
shr edx, 26
mov bl, byte ptr [edi+edx] //;put char in buffer
mov byte ptr[ebp], bl
inc ebp //;next buf
mov eax, ecx //;'return' pad count
finished:
test eax, eax
jz end //;no padding required
//;some bytes were padding, put them as =
sub ebp, eax //;move ptr back for num bytes to pad
padChars:
mov byte ptr[ebp], 0x3d //;=
inc ebp
dec eax
jnz padChars
end:
pop ebp //;restore frame pointer
}
}
There were several key points to my optimization technique:
- Firstly, unrolling of loops: eliminating loop structures in the code in favour of writing out the operations on the data in a straight line. In terms of asm optimization this reduces branching, so EIP does not have to jump as often.
- Another key point was minimizing the load/store operations performed. To keep these down, you can use indirect addressing with a base and index register to get to the byte in memory you are interested in. For example:
mov edx, base
add edx, edi
mov bl, byte ptr[edx]
becomes
mov edx, base
mov bl, byte ptr[edx+edi]
- As Base64 encoding requires you to work at bit level using 3 bytes at a time, you can speed up the conversion of the bit sets by using the binary arithmetic operations SHL and SHR (shift left/right) to progressively take the most significant bits of one DWORD and operate on them at a lower significance by shifting them. For example, suppose EAX contains a DWORD and you copy it to EDX. You can discard the leftmost 6 bits of EAX by shifting left by 6, and isolate those same 6 bits in EDX by shifting EDX right by 26 bits. A single shift by a multi-bit count like this is much quicker than shifting out one bit at a time.
My final throughput on the official test machine was 196.92 MB/s, which is almost double the throughput of the CryptoAPI function, which came in at 104.05 MB/s