FMA3 slower on Ryzen 3 / Zen 2?
Posted: 2020-07-16, 22:47:40
The following disassembly snippets come from a 2-channel biquad filter function. The first snippet, which takes advantage of 128-bit (XMM) FMA3 instructions, takes nearly twice as long to execute as the second snippet, which simply uses mulps and add/subps, when processing 2048 blocks of floats. I'm running a Ryzen 9 3900X. Any thoughts on this at all? Could there possibly be a warm-up time required before FMA3 instructions reach their advertised speed advantage? I know you haven't tested Zen 2 yet, and I'd be happy to help, but my machine is running Windows.
(Using FMA3)
; ProcessBiquadVec4, FMA3 build (name taken from the branch-target annotations below).
; Register use visible in this listing:
;   rcx = base pointer: five scalars are broadcast from [rcx+3B0h]..[rcx+3C0h], and two
;         8-byte values are loaded from and later stored back to [rcx+5D8h] / [rcx+658h]
;   rdx = data pointer, read and overwritten in place 8 bytes at a time
;   r8  = number of 8-byte elements (end pointer = rdx + r8*8)
; xmm6-xmm9 are spilled to the stack and restored before ret (callee-saved under the
; Microsoft x64 convention).
; NOTE(review): presumably the five scalars are the biquad coefficients and the two 8-byte
; values are the filter state, one float per channel -- confirm against the source code.
00007FFEEC766E30 sub rsp,48h
00007FFEEC766E34 vbroadcastss xmm5,dword ptr [rcx+3B8h]
; xmm3 / xmm4 = the two carried 8-byte values (low two float lanes).
00007FFEEC766E3D vmovsd xmm3,qword ptr [rcx+5D8h]
00007FFEEC766E45 vmovsd xmm4,qword ptr [rcx+658h]
00007FFEEC766E4D vmovaps xmmword ptr [rsp+30h],xmm6
; rax = end-of-data pointer.
00007FFEEC766E53 lea rax,[rdx+r8*8]
00007FFEEC766E57 vmovaps xmmword ptr [rsp+20h],xmm7
00007FFEEC766E5D vmovaps xmmword ptr [rsp+10h],xmm8
00007FFEEC766E63 vmovaps xmmword ptr [rsp],xmm9
00007FFEEC766E68 vbroadcastss xmm9,dword ptr [rcx+3C0h]
00007FFEEC766E71 vbroadcastss xmm6,dword ptr [rcx+3B0h]
00007FFEEC766E7A vbroadcastss xmm8,dword ptr [rcx+3B4h]
00007FFEEC766E83 vbroadcastss xmm7,dword ptr [rcx+3BCh]
; Skip the loop entirely when there is no data (unsigned compare of start vs. end pointer).
00007FFEEC766E8C cmp rdx,rax
00007FFEEC766E8F jae ProcessBiquadVec4+92h (07FFEEC766EC2h)
; ---- Loop body: one 8-byte element (two packed floats) per iteration ----
; xmm1 = input element.
00007FFEEC766E91 vmovsd xmm1,qword ptr [rdx]
; xmm3 = xmm1*xmm6 + xmm3 : the output value; consumes xmm3 carried from the previous iteration.
00007FFEEC766E95 vfmadd231ps xmm3,xmm1,xmm6
; xmm0 = output; xmm3 is then re-seeded from the carried xmm4.
00007FFEEC766E9A vmovups xmm0,xmm3
00007FFEEC766E9E vmovups xmm3,xmm4
; The output overwrites the input in place.
00007FFEEC766EA2 vmovlps qword ptr [rdx],xmm0
00007FFEEC766EA6 add rdx,8
; xmm3 = xmm3 - xmm7*xmm0 (needs the output just computed).
00007FFEEC766EAA vfnmadd231ps xmm3,xmm7,xmm0
; xmm4 = xmm1*xmm5 (depends on the input only).
00007FFEEC766EAF vmulps xmm4,xmm1,xmm5
; xmm3 = xmm3 + xmm1*xmm8 : the value carried into the next iteration's first FMA.
00007FFEEC766EB3 vfmadd231ps xmm3,xmm1,xmm8
; xmm4 = xmm4 - xmm9*xmm0 : second carried value.
00007FFEEC766EB8 vfnmadd231ps xmm4,xmm9,xmm0
; Loop-carried dependency through xmm3: the FMA at ...E95 feeds the FMA at ...EAA (via xmm0),
; which feeds the FMA at ...EB3, which feeds ...E95 of the next iteration -- three dependent
; FMA instructions per iteration.
; NOTE(review): this serial chain is likely what bounds the loop's throughput; check the FMA
; latency for the target CPU before drawing conclusions.
00007FFEEC766EBD cmp rdx,rax
00007FFEEC766EC0 jb ProcessBiquadVec4+61h (07FFEEC766E91h)
; ---- Epilogue: restore the saved registers and write the carried values back ----
00007FFEEC766EC2 vmovaps xmm6,xmmword ptr [rsp+30h]
00007FFEEC766EC8 vmovaps xmm7,xmmword ptr [rsp+20h]
00007FFEEC766ECE vmovaps xmm8,xmmword ptr [rsp+10h]
00007FFEEC766ED4 vmovaps xmm9,xmmword ptr [rsp]
00007FFEEC766ED9 vmovlps qword ptr [rcx+5D8h],xmm3
00007FFEEC766EE1 vmovlps qword ptr [rcx+658h],xmm4
00007FFEEC766EE9 add rsp,48h
00007FFEEC766EED ret
(Not using FMA3)
; ProcessBiquadVec4, build without FMA3 (name taken from the branch-target annotations below).
; Register use visible in this listing:
;   rcx = base pointer: five scalars are broadcast from [rcx+3B0h]..[rcx+3C0h], and two
;         8-byte values are loaded from and later stored back to [rcx+5D8h] / [rcx+658h]
;   rdx = data pointer, read and overwritten in place 8 bytes at a time
;   r8  = number of 8-byte elements (end pointer = rdx + r8*8)
; xmm6-xmm11 are spilled to the stack and restored before ret (callee-saved under the
; Microsoft x64 convention).
; NOTE(review): presumably the five scalars are the biquad coefficients and the two 8-byte
; values are the filter state, one float per channel -- confirm against the source code.
00007FFEEC766E30 sub rsp,68h
; xmm5 = first carried 8-byte value (low two float lanes).
00007FFEEC766E34 vmovsd xmm5,qword ptr [rcx+5D8h]
00007FFEEC766E3C vmovaps xmmword ptr [rsp+50h],xmm6
; rax = end-of-data pointer.
00007FFEEC766E42 lea rax,[rdx+r8*8]
00007FFEEC766E46 vmovaps xmmword ptr [rsp+40h],xmm7
00007FFEEC766E4C vmovaps xmmword ptr [rsp+30h],xmm8
00007FFEEC766E52 vmovaps xmmword ptr [rsp+20h],xmm9
00007FFEEC766E58 vmovaps xmmword ptr [rsp+10h],xmm10
00007FFEEC766E5E vmovaps xmmword ptr [rsp],xmm11
00007FFEEC766E63 vbroadcastss xmm11,dword ptr [rcx+3C0h]
00007FFEEC766E6C vbroadcastss xmm7,dword ptr [rcx+3B0h]
00007FFEEC766E75 vbroadcastss xmm8,dword ptr [rcx+3B4h]
00007FFEEC766E7E vbroadcastss xmm9,dword ptr [rcx+3B8h]
00007FFEEC766E87 vbroadcastss xmm10,dword ptr [rcx+3BCh]
; xmm6 = second carried 8-byte value (loaded only after the caller's xmm6 was saved above).
00007FFEEC766E90 vmovsd xmm6,qword ptr [rcx+658h]
; Skip the loop entirely when there is no data (unsigned compare of start vs. end pointer).
00007FFEEC766E98 cmp rdx,rax
00007FFEEC766E9B jae ProcessBiquadVec4+0A9h (07FFEEC766ED9h)
; 3-byte padding nop: the loop top that follows sits at an address ending in 0.
00007FFEEC766E9D nop dword ptr [rax]
; ---- Loop body: one 8-byte element (two packed floats) per iteration ----
; xmm3 = input element.
00007FFEEC766EA0 vmovsd xmm3,qword ptr [rdx]
; xmm4 = xmm3*xmm7 + xmm5 : the output value; the add consumes xmm5 carried from the
; previous iteration, while the multiply depends on the input only.
00007FFEEC766EA4 vmulps xmm2,xmm3,xmm7
00007FFEEC766EA8 vaddps xmm4,xmm2,xmm5
; xmm2 = xmm3*xmm8 + xmm6 (input term plus the second carried value).
00007FFEEC766EAC vmulps xmm0,xmm3,xmm8
00007FFEEC766EB1 vaddps xmm2,xmm0,xmm6
; The output overwrites the input in place.
00007FFEEC766EB5 vmovlps qword ptr [rdx],xmm4
00007FFEEC766EB9 add rdx,8
; xmm1 = output*xmm10, xmm3 = input*xmm9, xmm0 = output*xmm11.
00007FFEEC766EBD vmulps xmm1,xmm4,xmm10
00007FFEEC766EC2 vmulps xmm3,xmm3,xmm9
00007FFEEC766EC7 vmulps xmm0,xmm4,xmm11
; New carried values: xmm5 = xmm2 - xmm1, xmm6 = xmm3 - xmm0.
00007FFEEC766ECC vsubps xmm5,xmm2,xmm1
00007FFEEC766ED0 vsubps xmm6,xmm3,xmm0
; Loop-carried dependency through xmm5: vaddps at ...EA8 feeds vmulps at ...EBD, which feeds
; vsubps at ...ECC, which feeds ...EA8 of the next iteration -- one add, one multiply and one
; subtract per iteration. The multiplies that read only the input are off this chain.
; NOTE(review): this serial chain is likely what bounds the loop's throughput; check the
; add/mul latencies for the target CPU before drawing conclusions.
00007FFEEC766ED4 cmp rdx,rax
00007FFEEC766ED7 jb ProcessBiquadVec4+70h (07FFEEC766EA0h)
; ---- Epilogue: restore the saved registers and write the carried values back ----
00007FFEEC766ED9 vmovaps xmm7,xmmword ptr [rsp+40h]
00007FFEEC766EDF vmovaps xmm8,xmmword ptr [rsp+30h]
00007FFEEC766EE5 vmovaps xmm9,xmmword ptr [rsp+20h]
00007FFEEC766EEB vmovaps xmm10,xmmword ptr [rsp+10h]
00007FFEEC766EF1 vmovaps xmm11,xmmword ptr [rsp]
00007FFEEC766EF6 vmovlps qword ptr [rcx+5D8h],xmm5
00007FFEEC766EFE vmovlps qword ptr [rcx+658h],xmm6
; xmm6 is restored last because it held a carried value until the store just above.
00007FFEEC766F06 vmovaps xmm6,xmmword ptr [rsp+50h]
00007FFEEC766F0C add rsp,68h
00007FFEEC766F10 ret
(Using FMA3)
; ProcessBiquadVec4, FMA3 build (name taken from the branch-target annotations below).
; Register use visible in this listing:
;   rcx = base pointer: five scalars are broadcast from [rcx+3B0h]..[rcx+3C0h], and two
;         8-byte values are loaded from and later stored back to [rcx+5D8h] / [rcx+658h]
;   rdx = data pointer, read and overwritten in place 8 bytes at a time
;   r8  = number of 8-byte elements (end pointer = rdx + r8*8)
; xmm6-xmm9 are spilled to the stack and restored before ret (callee-saved under the
; Microsoft x64 convention).
; NOTE(review): presumably the five scalars are the biquad coefficients and the two 8-byte
; values are the filter state, one float per channel -- confirm against the source code.
00007FFEEC766E30 sub rsp,48h
00007FFEEC766E34 vbroadcastss xmm5,dword ptr [rcx+3B8h]
; xmm3 / xmm4 = the two carried 8-byte values (low two float lanes).
00007FFEEC766E3D vmovsd xmm3,qword ptr [rcx+5D8h]
00007FFEEC766E45 vmovsd xmm4,qword ptr [rcx+658h]
00007FFEEC766E4D vmovaps xmmword ptr [rsp+30h],xmm6
; rax = end-of-data pointer.
00007FFEEC766E53 lea rax,[rdx+r8*8]
00007FFEEC766E57 vmovaps xmmword ptr [rsp+20h],xmm7
00007FFEEC766E5D vmovaps xmmword ptr [rsp+10h],xmm8
00007FFEEC766E63 vmovaps xmmword ptr [rsp],xmm9
00007FFEEC766E68 vbroadcastss xmm9,dword ptr [rcx+3C0h]
00007FFEEC766E71 vbroadcastss xmm6,dword ptr [rcx+3B0h]
00007FFEEC766E7A vbroadcastss xmm8,dword ptr [rcx+3B4h]
00007FFEEC766E83 vbroadcastss xmm7,dword ptr [rcx+3BCh]
; Skip the loop entirely when there is no data (unsigned compare of start vs. end pointer).
00007FFEEC766E8C cmp rdx,rax
00007FFEEC766E8F jae ProcessBiquadVec4+92h (07FFEEC766EC2h)
; ---- Loop body: one 8-byte element (two packed floats) per iteration ----
; xmm1 = input element.
00007FFEEC766E91 vmovsd xmm1,qword ptr [rdx]
; xmm3 = xmm1*xmm6 + xmm3 : the output value; consumes xmm3 carried from the previous iteration.
00007FFEEC766E95 vfmadd231ps xmm3,xmm1,xmm6
; xmm0 = output; xmm3 is then re-seeded from the carried xmm4.
00007FFEEC766E9A vmovups xmm0,xmm3
00007FFEEC766E9E vmovups xmm3,xmm4
; The output overwrites the input in place.
00007FFEEC766EA2 vmovlps qword ptr [rdx],xmm0
00007FFEEC766EA6 add rdx,8
; xmm3 = xmm3 - xmm7*xmm0 (needs the output just computed).
00007FFEEC766EAA vfnmadd231ps xmm3,xmm7,xmm0
; xmm4 = xmm1*xmm5 (depends on the input only).
00007FFEEC766EAF vmulps xmm4,xmm1,xmm5
; xmm3 = xmm3 + xmm1*xmm8 : the value carried into the next iteration's first FMA.
00007FFEEC766EB3 vfmadd231ps xmm3,xmm1,xmm8
; xmm4 = xmm4 - xmm9*xmm0 : second carried value.
00007FFEEC766EB8 vfnmadd231ps xmm4,xmm9,xmm0
; Loop-carried dependency through xmm3: the FMA at ...E95 feeds the FMA at ...EAA (via xmm0),
; which feeds the FMA at ...EB3, which feeds ...E95 of the next iteration -- three dependent
; FMA instructions per iteration.
; NOTE(review): this serial chain is likely what bounds the loop's throughput; check the FMA
; latency for the target CPU before drawing conclusions.
00007FFEEC766EBD cmp rdx,rax
00007FFEEC766EC0 jb ProcessBiquadVec4+61h (07FFEEC766E91h)
; ---- Epilogue: restore the saved registers and write the carried values back ----
00007FFEEC766EC2 vmovaps xmm6,xmmword ptr [rsp+30h]
00007FFEEC766EC8 vmovaps xmm7,xmmword ptr [rsp+20h]
00007FFEEC766ECE vmovaps xmm8,xmmword ptr [rsp+10h]
00007FFEEC766ED4 vmovaps xmm9,xmmword ptr [rsp]
00007FFEEC766ED9 vmovlps qword ptr [rcx+5D8h],xmm3
00007FFEEC766EE1 vmovlps qword ptr [rcx+658h],xmm4
00007FFEEC766EE9 add rsp,48h
00007FFEEC766EED ret
(Not using FMA3)
; ProcessBiquadVec4, build without FMA3 (name taken from the branch-target annotations below).
; Register use visible in this listing:
;   rcx = base pointer: five scalars are broadcast from [rcx+3B0h]..[rcx+3C0h], and two
;         8-byte values are loaded from and later stored back to [rcx+5D8h] / [rcx+658h]
;   rdx = data pointer, read and overwritten in place 8 bytes at a time
;   r8  = number of 8-byte elements (end pointer = rdx + r8*8)
; xmm6-xmm11 are spilled to the stack and restored before ret (callee-saved under the
; Microsoft x64 convention).
; NOTE(review): presumably the five scalars are the biquad coefficients and the two 8-byte
; values are the filter state, one float per channel -- confirm against the source code.
00007FFEEC766E30 sub rsp,68h
; xmm5 = first carried 8-byte value (low two float lanes).
00007FFEEC766E34 vmovsd xmm5,qword ptr [rcx+5D8h]
00007FFEEC766E3C vmovaps xmmword ptr [rsp+50h],xmm6
; rax = end-of-data pointer.
00007FFEEC766E42 lea rax,[rdx+r8*8]
00007FFEEC766E46 vmovaps xmmword ptr [rsp+40h],xmm7
00007FFEEC766E4C vmovaps xmmword ptr [rsp+30h],xmm8
00007FFEEC766E52 vmovaps xmmword ptr [rsp+20h],xmm9
00007FFEEC766E58 vmovaps xmmword ptr [rsp+10h],xmm10
00007FFEEC766E5E vmovaps xmmword ptr [rsp],xmm11
00007FFEEC766E63 vbroadcastss xmm11,dword ptr [rcx+3C0h]
00007FFEEC766E6C vbroadcastss xmm7,dword ptr [rcx+3B0h]
00007FFEEC766E75 vbroadcastss xmm8,dword ptr [rcx+3B4h]
00007FFEEC766E7E vbroadcastss xmm9,dword ptr [rcx+3B8h]
00007FFEEC766E87 vbroadcastss xmm10,dword ptr [rcx+3BCh]
; xmm6 = second carried 8-byte value (loaded only after the caller's xmm6 was saved above).
00007FFEEC766E90 vmovsd xmm6,qword ptr [rcx+658h]
; Skip the loop entirely when there is no data (unsigned compare of start vs. end pointer).
00007FFEEC766E98 cmp rdx,rax
00007FFEEC766E9B jae ProcessBiquadVec4+0A9h (07FFEEC766ED9h)
; 3-byte padding nop: the loop top that follows sits at an address ending in 0.
00007FFEEC766E9D nop dword ptr [rax]
; ---- Loop body: one 8-byte element (two packed floats) per iteration ----
; xmm3 = input element.
00007FFEEC766EA0 vmovsd xmm3,qword ptr [rdx]
; xmm4 = xmm3*xmm7 + xmm5 : the output value; the add consumes xmm5 carried from the
; previous iteration, while the multiply depends on the input only.
00007FFEEC766EA4 vmulps xmm2,xmm3,xmm7
00007FFEEC766EA8 vaddps xmm4,xmm2,xmm5
; xmm2 = xmm3*xmm8 + xmm6 (input term plus the second carried value).
00007FFEEC766EAC vmulps xmm0,xmm3,xmm8
00007FFEEC766EB1 vaddps xmm2,xmm0,xmm6
; The output overwrites the input in place.
00007FFEEC766EB5 vmovlps qword ptr [rdx],xmm4
00007FFEEC766EB9 add rdx,8
; xmm1 = output*xmm10, xmm3 = input*xmm9, xmm0 = output*xmm11.
00007FFEEC766EBD vmulps xmm1,xmm4,xmm10
00007FFEEC766EC2 vmulps xmm3,xmm3,xmm9
00007FFEEC766EC7 vmulps xmm0,xmm4,xmm11
; New carried values: xmm5 = xmm2 - xmm1, xmm6 = xmm3 - xmm0.
00007FFEEC766ECC vsubps xmm5,xmm2,xmm1
00007FFEEC766ED0 vsubps xmm6,xmm3,xmm0
; Loop-carried dependency through xmm5: vaddps at ...EA8 feeds vmulps at ...EBD, which feeds
; vsubps at ...ECC, which feeds ...EA8 of the next iteration -- one add, one multiply and one
; subtract per iteration. The multiplies that read only the input are off this chain.
; NOTE(review): this serial chain is likely what bounds the loop's throughput; check the
; add/mul latencies for the target CPU before drawing conclusions.
00007FFEEC766ED4 cmp rdx,rax
00007FFEEC766ED7 jb ProcessBiquadVec4+70h (07FFEEC766EA0h)
; ---- Epilogue: restore the saved registers and write the carried values back ----
00007FFEEC766ED9 vmovaps xmm7,xmmword ptr [rsp+40h]
00007FFEEC766EDF vmovaps xmm8,xmmword ptr [rsp+30h]
00007FFEEC766EE5 vmovaps xmm9,xmmword ptr [rsp+20h]
00007FFEEC766EEB vmovaps xmm10,xmmword ptr [rsp+10h]
00007FFEEC766EF1 vmovaps xmm11,xmmword ptr [rsp]
00007FFEEC766EF6 vmovlps qword ptr [rcx+5D8h],xmm5
00007FFEEC766EFE vmovlps qword ptr [rcx+658h],xmm6
; xmm6 is restored last because it held a carried value until the store just above.
00007FFEEC766F06 vmovaps xmm6,xmmword ptr [rsp+50h]
00007FFEEC766F0C add rsp,68h
00007FFEEC766F10 ret