FIR Filter on MMX Pentium
; Inner-loop body of the FIR filter. Each pass consumes 8 bytes of sample
; data and 8 bytes of coefficient data and adds their products into mm7.
;
; Register roles, as used by the instructions below:
;   esi = pointer into the sample (delay-line) data
;   edi = byte offset into the coefficient table at COEFaddr
;   ecx = remaining loop count
;   mm0 = scratch (samples, then packed products)
;   mm7 = running accumulator, two packed 32-bit partial sums
;
; NOTE(review): the loop label, the conditional branch that consumes the
; flags set by `dec ecx`, the initial clearing of mm7 and the final merge
; of its two dword halves are not visible in this excerpt - confirm them
; in the surrounding code.
movq mm0, [esi] ; load 64 bits = four 16-bit samples
pmaddwd mm0, COEFaddr[edi] ; 4 signed 16x16 multiplies; adjacent products summed into 2 dwords
/* two cycle stall happens here */
; The paddd below reads mm0, so it must wait for the pmaddwd result.
; NOTE(review): the line above uses C-style comment delimiters, which are
; not comment syntax in MASM/NASM - confirm the target assembler.
; NOTE(review): the two `add` instructions below do not depend on mm0 and
; could presumably be scheduled into this gap; left in place because the
; snippet appears to be illustrating the stall - confirm intent.
paddd mm7, mm0 ; accumulate intermediate results (two packed dword adds, wraparound)
add edi, 8 ; advance coefficient offset by 8 bytes (four 16-bit coefficients)
add esi, 8 ; advance delay-line pointer by 8 bytes (four 16-bit samples)
dec ecx ; decrement loop count; ZF is set when it reaches zero