Open
Description
Bugzilla Link | 51246 |
Version | trunk |
OS | Windows NT |
CC | @alexey-bataev,@DMG862,@sdesmalen-arm,@rotateright |
Extended Description
define double @dot4f64(double* %ptrx, double* %ptry) {
%ptrx1 = getelementptr inbounds double, double* %ptrx, i64 1
%ptry1 = getelementptr inbounds double, double* %ptry, i64 1
%ptrx2 = getelementptr inbounds double, double* %ptrx, i64 2
%ptry2 = getelementptr inbounds double, double* %ptry, i64 2
%ptrx3 = getelementptr inbounds double, double* %ptrx, i64 3
%ptry3 = getelementptr inbounds double, double* %ptry, i64 3
%x0 = load double, double* %ptrx, align 8
%y0 = load double, double* %ptry, align 8
%x1 = load double, double* %ptrx1, align 8
%y1 = load double, double* %ptry1, align 8
%x2 = load double, double* %ptrx2, align 8
%y2 = load double, double* %ptry2, align 8
%x3 = load double, double* %ptrx3, align 8
%y3 = load double, double* %ptry3, align 8
%mul0 = fmul double %x0, %y0
%mul1 = fmul double %x1, %y1
%mul2 = fmul double %x2, %y2
%mul3 = fmul double %x3, %y3
%dot01 = fadd double %mul0, %mul1
%dot012 = fadd double %dot01, %mul2
%dot0123 = fadd double %dot012, %mul3
ret double %dot0123
}
At least on AVX targets, this (non-fast, i.e. lacking fast-math reassociation flags) dot product would still benefit from vectorizing the multiplications and then performing a sequential/unordered fadd reduction — but SLP currently limits reductions to associative ops.