; FIR filter inner loop for the Pentium processor with MMX technology
; Body of one pass of a software-pipelined FIR multiply-accumulate loop.
; Each pass multiplies the 32 bytes of samples at [esi] by the 32 bytes of
; coefficients at COEFaddr[edi]. pmaddwd treats both operands as packed
; signed 16-bit words, so one pass covers 16 taps.
;
; Register roles as used in this block:
;   esi      - byte pointer into the sample (delay-line) buffer
;   edi      - byte offset into the COEFaddr coefficient table
;   ecx      - remaining pass count
;   mm7      - accumulator holding two packed 32-bit partial sums
;   mm0, mm1 - samples for [esi+0] and [esi+8]; the bottom of this block
;              loads them for the following pass
;   mm2, mm3 - on entry, products not yet added to mm7; on exit, the
;              products of samples [esi+16] and [esi+24] of this pass
;
; NOTE(review): the loop label, the branch that consumes the flags from
; `dec ecx`, and the setup/drain code are outside this excerpt. Presumably
; the prologue clears mm7/mm2/mm3 and preloads mm0/mm1, and the epilogue
; adds the last mm2/mm3 products into mm7 and combines its two halves -
; confirm against the surrounding code.
pmaddwd mm0, COEFaddr[edi] ; samples 0-3 x coefs 0-3, adjacent products summed into 2 dwords
paddd mm7, mm2 ; fold in the first product pair carried over from the previous pass
pmaddwd mm1, COEFaddr[edi+8] ; samples 4-7 x coefs 4-7
paddd mm7, mm3 ; fold in the second carried-over pair; mm2/mm3 are free from here on
movq mm2, [esi+16] ; samples 8-11 of this pass
movq mm3, [esi+24] ; samples 12-15 of this pass
paddd mm7, mm0 ; accumulate the products of samples 0-3
pmaddwd mm2, COEFaddr[edi+16] ; samples 8-11 x coefs 8-11; left in mm2 for the next pass to add
paddd mm7, mm1 ; accumulate the products of samples 4-7
pmaddwd mm3, COEFaddr[edi+24] ; samples 12-15 x coefs 12-15; left in mm3 for the next pass to add
; Preload the first eight samples of the next pass. esi has not been
; advanced yet, hence the +32/+40 displacements.
; NOTE(review): these loads also execute on the final pass, reading the
; 16 bytes that follow the last consumed samples - confirm the buffer
; allows that.
movq mm0, [esi+32] ; next pass, samples 0-3
movq mm1, [esi+40] ; next pass, samples 4-7
add edi, 32 ; step past the 16 coefficients just used (4 quadwords)
add esi, 32 ; step past the 16 samples just used; preloaded data is now at [esi+0..15]
dec ecx ; one pass done; ZF is set when the count reaches zero (branch not shown here)