/*
 * 16-tap pipelined dot product for Pentium, equivalent to (but much faster
 * than) the C routine:
 *
 *	float fir16(float a[16],float b[16])
 *	{
 *		int i;
 *		float sum = 0;
 *		for(i=0;i<16;i++)
 *			sum += a[i]*b[i];
 *		return sum;
 *	}
 *
 * Copyright 1996 Phil Karn, KA9Q
 *
 * Syntax:   GNU as, AT&T operand order, 32-bit x86 with x87 FPU.
 * In:       4(%esp) -> a[], 8(%esp) -> b[]  (both arguments on the stack)
 * Out:      %st(0) = sum of a[i]*b[i] for i = 0..15
 * Clobbers: %eax, %edx.  The x87 register stack is at most three entries
 *           deep inside the routine and holds only the result on return.
 *
 * Both pointers are kept in caller-saved scratch registers, so nothing has
 * to be pushed or restored and the routine never writes to memory.
 *
 * Schedule: product k is started before product k-1 has been added to the
 * running sum.  fxch then brings product k-1 to the top of the x87 stack
 * and faddp folds it into the sum, so each add can overlap the multiply
 * issued just before it.  The steady state before every tap is
 *	%st(0) = running sum, %st(1) = newest product (not yet added).
 *
 * Products and partial sums stay in x87 registers between steps; nothing
 * is stored back to memory as a float until the caller does so.
 */
	.globl	_fir16
_fir16:
	movl	4(%esp),%eax	/* eax -> a[] */
	movl	8(%esp),%edx	/* edx -> b[] */

	/* Prime the pipeline with the first three products */
	flds	(%eax)
	fmuls	(%edx)		/* a[0]*b[0] */
	flds	4(%eax)
	fmuls	4(%edx)		/* a[1]*b[1] a[0]*b[0] */
	flds	8(%eax)
	fmuls	8(%edx)		/* a[2]*b[2] a[1]*b[1] a[0]*b[0] */
	fxch	%st(2)		/* a[0]*b[0] a[1]*b[1] a[2]*b[2] */
	faddp			/* sum0,1 a[2]*b[2] */

	/* Taps 3..15: start product k, then add product k-1 into the sum */
	flds	12(%eax)
	fmuls	12(%edx)	/* a[3]*b[3] sum0,1 a[2]*b[2] */
	fxch	%st(2)		/* a[2]*b[2] sum0,1 a[3]*b[3] */
	faddp			/* sum0,1,2 a[3]*b[3] */

	flds	16(%eax)
	fmuls	16(%edx)	/* a[4]*b[4] sum a[3]*b[3] */
	fxch	%st(2)
	faddp			/* sum0..3 a[4]*b[4] */

	flds	20(%eax)
	fmuls	20(%edx)	/* a[5]*b[5] sum a[4]*b[4] */
	fxch	%st(2)
	faddp			/* sum0..4 a[5]*b[5] */

	flds	24(%eax)
	fmuls	24(%edx)	/* a[6]*b[6] sum a[5]*b[5] */
	fxch	%st(2)
	faddp			/* sum0..5 a[6]*b[6] */

	flds	28(%eax)
	fmuls	28(%edx)	/* a[7]*b[7] sum a[6]*b[6] */
	fxch	%st(2)
	faddp			/* sum0..6 a[7]*b[7] */

	flds	32(%eax)
	fmuls	32(%edx)	/* a[8]*b[8] sum a[7]*b[7] */
	fxch	%st(2)
	faddp			/* sum0..7 a[8]*b[8] */

	flds	36(%eax)
	fmuls	36(%edx)	/* a[9]*b[9] sum a[8]*b[8] */
	fxch	%st(2)
	faddp			/* sum0..8 a[9]*b[9] */

	flds	40(%eax)
	fmuls	40(%edx)	/* a[10]*b[10] sum a[9]*b[9] */
	fxch	%st(2)
	faddp			/* sum0..9 a[10]*b[10] */

	flds	44(%eax)
	fmuls	44(%edx)	/* a[11]*b[11] sum a[10]*b[10] */
	fxch	%st(2)
	faddp			/* sum0..10 a[11]*b[11] */

	flds	48(%eax)
	fmuls	48(%edx)	/* a[12]*b[12] sum a[11]*b[11] */
	fxch	%st(2)
	faddp			/* sum0..11 a[12]*b[12] */

	flds	52(%eax)
	fmuls	52(%edx)	/* a[13]*b[13] sum a[12]*b[12] */
	fxch	%st(2)
	faddp			/* sum0..12 a[13]*b[13] */

	flds	56(%eax)
	fmuls	56(%edx)	/* a[14]*b[14] sum a[13]*b[13] */
	fxch	%st(2)
	faddp			/* sum0..13 a[14]*b[14] */

	flds	60(%eax)
	fmuls	60(%edx)	/* a[15]*b[15] sum a[14]*b[14] */
	fxch	%st(2)
	faddp			/* sum0..14 a[15]*b[15] */

	/* Drain the pipeline: fold in the last outstanding product */
	faddp			/* sum left on stack */
	ret