-30 (-7.18 % of base) - System.PackedSpanHelpers:IndexOf[System.SpanHelpers+DontNegate`1[short],System.PackedSpanHelpers+NopTransform](byref,short,int):int
; Assembly listing for method System.PackedSpanHelpers:IndexOf[System.SpanHelpers+DontNegate`1[short],System.PackedSpanHelpers+NopTransform](byref,short,int):int (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 19 single block inlinees; 5 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T02] ( 15, 12 ) byref -> rdi single-def
; V01 arg1 [V01,T06] ( 6, 3.75) short -> rsi single-def
; V02 arg2 [V02,T01] ( 15, 19.50) int -> rdx
; V03 loc0 [V03,T03] ( 6, 13.50) long -> rcx
; V04 loc1 [V04,T00] ( 10, 22.50) byref -> rax
;* V05 loc2 [V05 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V06 loc3 [V06 ] ( 0, 0 ) byref -> zero-ref
;* V07 loc4 [V07 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[short]>
;* V08 loc5 [V08 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V09 loc6 [V09 ] ( 0, 0 ) byref -> zero-ref
;* V10 loc7 [V10 ] ( 0, 0 ) byref -> zero-ref
;* V11 loc8 [V11 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[short]>
;* V12 loc9 [V12 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
; V13 loc10 [V13,T17] ( 3, 5 ) simd32 -> mm0 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V14 loc11 [V14,T07] ( 2, 4.50) byref -> rcx single-def
;* V15 loc12 [V15 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[short]>
;* V16 loc13 [V16 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V17 loc14 [V17,T16] ( 4, 12.50) simd32 -> mm1 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V18 loc15 [V18,T10] ( 5, 2.50) byref -> rcx single-def
;* V19 loc16 [V19 ] ( 0, 0 ) byref -> zero-ref single-def
;* V20 loc17 [V20 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[short]>
;* V21 loc18 [V21 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V22 loc19 [V22,T18] ( 4, 2 ) simd32 -> mm0 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V23 loc20 [V23,T20] ( 2, 1 ) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V24 loc21 [V24 ] ( 0, 0 ) byref -> zero-ref
;* V25 loc22 [V25 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[short]>
;* V26 loc23 [V26 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V27 loc24 [V27 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V28 loc25 [V28,T11] ( 5, 2.50) byref -> rax single-def
;* V29 loc26 [V29 ] ( 0, 0 ) byref -> zero-ref single-def
;* V30 loc27 [V30 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[short]>
;* V31 loc28 [V31 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V32 loc29 [V32,T19] ( 4, 2 ) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;# V33 OutArgs [V33 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V34 tmp1 [V34,T14] ( 3, 1.50) byref -> rcx
;* V35 tmp2 [V35 ] ( 0, 0 ) simd16 -> zero-ref "impAppendStmt"
;* V36 tmp3 [V36 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
; V37 tmp4 [V37,T15] ( 3, 1.50) byref -> rax
;* V38 tmp5 [V38 ] ( 0, 0 ) simd32 -> zero-ref "impAppendStmt"
;* V39 tmp6 [V39 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
;* V40 tmp7 [V40 ] ( 0, 0 ) simd32 -> zero-ref "impAppendStmt"
;* V41 tmp8 [V41 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
;* V42 tmp9 [V42 ] ( 0, 0 ) short -> zero-ref "Inlining Arg"
;* V43 tmp10 [V43 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V44 tmp11 [V44 ] ( 0, 0 ) short -> zero-ref "Inlining Arg"
;* V45 tmp12 [V45 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V46 tmp13 [V46 ] ( 0, 0 ) short -> zero-ref "Inlining Arg"
;* V47 tmp14 [V47 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V48 tmp15 [V48 ] ( 0, 0 ) short -> zero-ref "Inlining Arg"
;* V49 tmp16 [V49 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V50 tmp17 [V50 ] ( 0, 0 ) short -> zero-ref "Inlining Arg"
;* V51 tmp18 [V51 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V52 tmp19 [V52 ] ( 0, 0 ) simd32 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V53 tmp20 [V53 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
;* V54 tmp21 [V54 ] ( 0, 0 ) simd32 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V55 tmp22 [V55 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
; V56 tmp23 [V56,T12] ( 5, 2.50) int -> rdx "Inline stloc first use temp"
; V57 tmp24 [V57,T08] ( 3, 3 ) byref -> rax "Inlining Arg"
;* V58 tmp25 [V58 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V59 tmp26 [V59,T13] ( 5, 2.50) int -> rdx "Inline stloc first use temp"
; V60 tmp27 [V60,T09] ( 3, 3 ) byref -> rcx "Inlining Arg"
; V61 tmp28 [V61,T04] ( 7, 7 ) int -> rax "Single return block return value"
;* V62 tmp29 [V62 ] ( 0, 0 ) simd32 -> zero-ref "field V05._lower (fldOffset=0x0)" P-INDEP
;* V63 tmp30 [V63 ] ( 0, 0 ) simd32 -> zero-ref "field V05._upper (fldOffset=0x20)" P-INDEP
;* V64 tmp31 [V64 ] ( 0, 0 ) simd32 -> zero-ref "field V07._lower (fldOffset=0x0)" P-INDEP
;* V65 tmp32 [V65 ] ( 0, 0 ) simd32 -> zero-ref "field V07._upper (fldOffset=0x20)" P-INDEP
;* V66 tmp33 [V66 ] ( 0, 0 ) simd32 -> zero-ref "field V08._lower (fldOffset=0x0)" P-INDEP
;* V67 tmp34 [V67 ] ( 0, 0 ) simd32 -> zero-ref "field V08._upper (fldOffset=0x20)" P-INDEP
;* V68 tmp35 [V68 ] ( 0, 0 ) simd32 -> zero-ref "field V11._lower (fldOffset=0x0)" P-INDEP
;* V69 tmp36 [V69 ] ( 0, 0 ) simd32 -> zero-ref "field V11._upper (fldOffset=0x20)" P-INDEP
;* V70 tmp37 [V70 ] ( 0, 0 ) simd32 -> zero-ref "field V12._lower (fldOffset=0x0)" P-INDEP
;* V71 tmp38 [V71 ] ( 0, 0 ) simd32 -> zero-ref "field V12._upper (fldOffset=0x20)" P-INDEP
; V72 cse0 [V72,T05] ( 7, 6.75) int -> rax hoist multi-def "CSE #01: aggressive"
;
; Lcl frame size = 0
; Prolog and first dispatch. Per the local-variable table above:
; rdi = arg0 (byref), rsi = arg1 (short), rdx = arg2 (int).
; NOTE(review): arg2 is used below as an element count; the listing itself only shows its type.
G_M26041_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M26041_IG02:
; arg2 < 8 (signed): skip the vector paths and go to the scalar code at IG10.
cmp edx, 8
jl G_M26041_IG10
;; size=9 bbWeight=1 PerfScore 1.25
; 128-bit path, taken when 8 <= arg2 <= 16 (arg2 > 16 branches to IG05).
; rax is seeded with the start address (rdi) for the IG05 path.
G_M26041_IG03:
mov rax, rdi
cmp edx, 16
jg SHORT G_M26041_IG05
; Diff: the base zero-extended sil before the broadcast; the diff drops that movzx.
; vpbroadcastb replicates only the low byte of xmm0, so the upper bits of esi do not matter.
- movzx rsi, sil
vmovd xmm0, esi
vpbroadcastb xmm0, xmm0
; edx = arg2 - 8; rax = rdi + 2*(arg2 - 8), i.e. the address of the last 8 16-bit elements.
; Diff: the index temp moves from rax to rsi (rsi is no longer needed after the vmovd).
add edx, -8
- movsxd rax, edx
- lea rax, bword ptr [rdi+2*rax]
+ movsxd rsi, edx
+ lea rax, bword ptr [rdi+2*rsi]
; rcx = unsigned min(rdi, rax): first load address.
cmp rdi, rax
mov rcx, rdi
cmova rcx, rax
; Load 8 words at rcx, pack them with the 8 words at rax into 16 bytes
; (vpackuswb: unsigned-saturating word->byte), then compare each byte with the broadcast value.
vmovups xmm1, xmmword ptr [rcx]
vpackuswb xmm1, xmm1, xmmword ptr [rax]
vpcmpeqb xmm0, xmm1, xmm0
; No byte matched: return -1 via IG14.
vptest xmm0, xmm0
je G_M26041_IG14
; edx = bit position of the first matching byte.
; Positions 0..7 come from the vector at rcx, positions 8..15 from the vector at rax.
vpmovmskb edx, xmm0
tzcnt edx, edx
cmp edx, 8
jl SHORT G_M26041_IG04
; Match is in the second vector: rebase on rax and make the position relative to it.
mov rcx, rax
add edx, -8
- ;; size=83 bbWeight=0.50 PerfScore 11.88
+ ;; size=79 bbWeight=0.50 PerfScore 11.75
G_M26041_IG04:
; Result = (vector base - start) / 2 (byte offset -> element index) + position in vector.
sub rcx, rdi
shr rcx, 1
lea eax, [rcx+rdx]
jmp G_M26041_IG17
; Diff: alignment padding ahead of the IG06 loop drops from 6 bytes to 0.
- align [6 bytes for IG06]
- ;; size=20 bbWeight=0.50 PerfScore 1.62
+ align [0 bytes for IG06]
+ ;; size=14 bbWeight=0.50 PerfScore 1.62
; 256-bit path, reached from IG03 when arg2 > 16. rax still holds the start address copied from rdi in IG03.
G_M26041_IG05:
; Diff: the base zero-extended sil first; the diff drops it (vpbroadcastb uses only the low byte of xmm0).
- movzx rsi, sil
vmovd xmm0, esi
vpbroadcastb ymm0, ymm0
; arg2 <= 32: skip the main loop and go straight to the tail at IG07.
cmp edx, 32
jle SHORT G_M26041_IG07
; rcx = rax + 2*(arg2 - 32): upper bound for the loop pointer.
; Diff: the 32-bit temp moves from ecx to esi.
- lea ecx, [rdx-0x20]
- movsxd rcx, ecx
+ lea esi, [rdx-0x20]
+ movsxd rcx, esi
lea rcx, bword ptr [rax+2*rcx]
- ;; size=28 bbWeight=0.50 PerfScore 3.38
+ ;; size=24 bbWeight=0.50 PerfScore 3.25
G_M26041_IG06:
; Main loop: each iteration packs 2 x 32 bytes (32 16-bit elements) into 32 bytes
; and compares them against the broadcast value.
vmovups ymm1, ymmword ptr [rax]
vpackuswb ymm1, ymm1, ymmword ptr [rax+0x20]
vpcmpeqb ymm1, ymm1, ymm0
; Any match: compute the index at IG09 (the comparison mask stays in ymm1).
vptest ymm1, ymm1
jne SHORT G_M26041_IG09
; Advance 64 bytes and continue while rax < rcx (unsigned).
add rax, 64
cmp rax, rcx
jb SHORT G_M26041_IG06
;; size=29 bbWeight=4 PerfScore 64.00
; 256-bit tail: handles whatever the IG06 loop did not cover (or everything when arg2 <= 32).
G_M26041_IG07:
; rcx = rdi + 2*(arg2 - 16): address of the last 16 elements.
add edx, -16
movsxd rcx, edx
lea rcx, bword ptr [rdi+2*rcx]
; rax = unsigned min(rax, rcx): first load address for the tail.
cmp rax, rcx
cmova rax, rcx
vmovups ymm1, ymmword ptr [rax]
vpackuswb ymm1, ymm1, ymmword ptr [rcx]
vpcmpeqb ymm0, ymm1, ymm0
; No match: return -1 via IG14.
vptest ymm0, ymm0
je G_M26041_IG14
; vpermq imm -40 (0xD8) selects qwords 0,2,1,3. vpackuswb packs per 128-bit lane, so this
; restores source order: mask bits 0..15 <- vector at rax, bits 16..31 <- vector at rcx.
vpermq ymm0, ymm0, -40
vpmovmskb edx, ymm0
tzcnt edx, edx
cmp edx, 16
jl SHORT G_M26041_IG08
; Match is in the second vector: rebase on rcx and make the position relative to it.
mov rax, rcx
add edx, -16
;; size=65 bbWeight=0.50 PerfScore 12.38
G_M26041_IG08:
; Result = (vector base - start) / 2 + position in vector.
sub rax, rdi
shr rax, 1
add eax, edx
; Diff: the jump to the epilog shrinks to its SHORT form, and the alignment padding
; for IG13 drops from 13 bytes to 0.
- jmp G_M26041_IG17
- align [13 bytes for IG13]
- ;; size=26 bbWeight=0.50 PerfScore 1.50
+ jmp SHORT G_M26041_IG17
+ align [0 bytes for IG13]
+ ;; size=10 bbWeight=0.50 PerfScore 1.50
G_M26041_IG09:
; Match found inside the IG06 loop: eax = element index of the current loop position ...
sub rax, rdi
shr rax, 1
; ... plus the position of the first set bit in the lane-corrected mask from ymm1.
vpermq ymm0, ymm1, -40
vpmovmskb edi, ymm0
; NOTE(review): the xor before tzcnt is presumably there to break tzcnt's dependency on its destination register — confirm.
xor ecx, ecx
tzcnt ecx, edi
add eax, ecx
jmp SHORT G_M26041_IG17
;; size=26 bbWeight=0.50 PerfScore 5.12
; Scalar path for arg2 < 8 (reached from IG02). This path is identical in base and diff.
G_M26041_IG10:
; rcx = running element index, starting at 0.
xor ecx, ecx
; Fewer than 4 elements: skip the unrolled block.
cmp edx, 4
jl SHORT G_M26041_IG11
; Unrolled compare of elements 0..3 against the sign-extended value (eax).
; rcx is reused as the load temp here and is reset to 4 at the end of the block.
add edx, -4
movsx rcx, word ptr [rdi]
movsx rax, si
cmp ecx, eax
je SHORT G_M26041_IG22
movsx rcx, word ptr [rdi+0x02]
cmp ecx, eax
je SHORT G_M26041_IG20
movsx rcx, word ptr [rdi+0x04]
cmp ecx, eax
je SHORT G_M26041_IG18
movsx rcx, word ptr [rdi+0x06]
cmp ecx, eax
je SHORT G_M26041_IG16
; None of the first four matched: continue from index 4.
mov ecx, 4
;; size=54 bbWeight=0.50 PerfScore 11.62
G_M26041_IG11:
; Nothing left to examine: return -1 via IG14.
test edx, edx
jle SHORT G_M26041_IG14
;; size=4 bbWeight=0.50 PerfScore 0.62
G_M26041_IG12:
; Sign-extended value, set up once before the loop (also needed when IG10's unrolled block was skipped).
movsx rax, si
;; size=4 bbWeight=0.25 PerfScore 0.06
G_M26041_IG13:
; One element per iteration: edx counts down the remaining elements, rcx is the element index.
dec edx
movsx rsi, word ptr [rdi+2*rcx]
cmp esi, eax
je SHORT G_M26041_IG15
inc rcx
test edx, edx
jg SHORT G_M26041_IG13
;; size=18 bbWeight=4 PerfScore 28.00
; Return blocks. Each sets eax and then runs an epilog (vzeroupper / pop rbp / ret).
G_M26041_IG14:
; Not found.
mov eax, -1
jmp SHORT G_M26041_IG17
;; size=7 bbWeight=0.50 PerfScore 1.12
G_M26041_IG15:
; Found in the scalar loop (IG13): return the element index held in ecx.
mov eax, ecx
jmp SHORT G_M26041_IG17
;; size=4 bbWeight=0.50 PerfScore 1.12
G_M26041_IG16:
; Found at element 3 in the unrolled block of IG10; falls through into the epilog.
mov eax, 3
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M26041_IG17:
; Epilog shared by the vector paths and by IG14/IG15/IG16.
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M26041_IG18:
; Found at element 2 in the unrolled block of IG10.
mov eax, 2
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M26041_IG19:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M26041_IG20:
; Found at element 1 in the unrolled block of IG10.
mov eax, 1
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M26041_IG21:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M26041_IG22:
; Found at element 0 in the unrolled block of IG10.
xor eax, eax
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M26041_IG23:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
-; Total bytes of code 418, prolog size 4, PerfScore 150.44, instruction count 126, allocated bytes for code 418 (MethodHash=25ef9a46) for method System.PackedSpanHelpers:IndexOf[System.SpanHelpers+DontNegate`1[short],System.PackedSpanHelpers+NopTransform](byref,short,int):int (FullOpts)
+; Total bytes of code 388, prolog size 4, PerfScore 150.19, instruction count 124, allocated bytes for code 391 (MethodHash=25ef9a46) for method System.PackedSpanHelpers:IndexOf[System.SpanHelpers+DontNegate`1[short],System.PackedSpanHelpers+NopTransform](byref,short,int):int (FullOpts)
; ============================================================
-15 (-2.19 % of base) - System.PackedSpanHelpers:IndexOfAny[System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,int):int
; Assembly listing for method System.PackedSpanHelpers:IndexOfAny[System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,int):int (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 11 single block inlinees; 5 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T03] ( 15, 12 ) byref -> rdi single-def
; V01 arg1 [V01,T10] ( 6, 3.75) short -> rsi single-def
; V02 arg2 [V02,T05] ( 9, 7 ) short -> rdx single-def
; V03 arg3 [V03,T06] ( 9, 7 ) short -> rcx single-def
; V04 arg4 [V04,T01] ( 15, 19.50) int -> r8
; V05 loc0 [V05,T04] ( 6, 13.50) long -> r9
; V06 loc1 [V06,T02] ( 20, 20 ) short -> rax
; V07 loc2 [V07,T00] ( 10, 22.50) byref -> rax
;* V08 loc3 [V08 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V09 loc4 [V09 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V10 loc5 [V10 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V11 loc6 [V11 ] ( 0, 0 ) byref -> zero-ref
;* V12 loc7 [V12 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[short]>
;* V13 loc8 [V13 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V14 loc9 [V14 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V15 loc10 [V15 ] ( 0, 0 ) byref -> zero-ref
;* V16 loc11 [V16 ] ( 0, 0 ) byref -> zero-ref
;* V17 loc12 [V17 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[short]>
;* V18 loc13 [V18 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V19 loc14 [V19 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
; V20 loc15 [V20,T26] ( 3, 5 ) simd32 -> mm0 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V21 loc16 [V21,T27] ( 3, 5 ) simd32 -> mm1 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V22 loc17 [V22,T28] ( 3, 5 ) simd32 -> mm2 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V23 loc18 [V23,T11] ( 2, 4.50) byref -> rcx single-def
;* V24 loc19 [V24 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[short]>
; V25 loc20 [V25,T24] ( 4, 16 ) simd32 -> mm3 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V26 loc21 [V26,T25] ( 4, 12.50) simd32 -> mm3 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V27 loc22 [V27,T14] ( 5, 2.50) byref -> rcx single-def
;* V28 loc23 [V28 ] ( 0, 0 ) byref -> zero-ref single-def
;* V29 loc24 [V29 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[short]>
; V30 loc25 [V30,T29] ( 4, 2 ) simd32 -> mm3 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V31 loc26 [V31,T30] ( 4, 2 ) simd32 -> mm0 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V32 loc27 [V32,T33] ( 2, 1 ) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V33 loc28 [V33,T34] ( 2, 1 ) simd16 -> mm1 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V34 loc29 [V34,T35] ( 2, 1 ) simd16 -> mm2 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V35 loc30 [V35 ] ( 0, 0 ) byref -> zero-ref
;* V36 loc31 [V36 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[short]>
;* V37 loc32 [V37 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V38 loc33 [V38 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V39 loc34 [V39,T15] ( 5, 2.50) byref -> rax single-def
;* V40 loc35 [V40 ] ( 0, 0 ) byref -> zero-ref single-def
;* V41 loc36 [V41 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[short]>
; V42 loc37 [V42,T31] ( 4, 2 ) simd16 -> mm3 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V43 loc38 [V43,T32] ( 4, 2 ) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;# V44 OutArgs [V44 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-; V45 tmp1 [V45,T18] ( 3, 1.50) byref -> rcx
+; V45 tmp1 [V45,T18] ( 3, 1.50) byref -> rdx
;* V46 tmp2 [V46 ] ( 0, 0 ) simd16 -> zero-ref "impAppendStmt"
; V47 tmp3 [V47,T19] ( 3, 1.50) byref -> rax
;* V48 tmp4 [V48 ] ( 0, 0 ) simd32 -> zero-ref "impAppendStmt"
;* V49 tmp5 [V49 ] ( 0, 0 ) simd32 -> zero-ref "impAppendStmt"
; V50 tmp6 [V50,T09] ( 2, 4 ) int -> rax
; V51 tmp7 [V51,T20] ( 2, 1 ) int -> rax
; V52 tmp8 [V52,T21] ( 2, 1 ) int -> rax
; V53 tmp9 [V53,T22] ( 2, 1 ) int -> rax
; V54 tmp10 [V54,T23] ( 2, 1 ) int -> rax
;* V55 tmp11 [V55 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V56 tmp12 [V56 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V57 tmp13 [V57 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V58 tmp14 [V58 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V59 tmp15 [V59 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V60 tmp16 [V60 ] ( 0, 0 ) simd32 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V61 tmp17 [V61 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
;* V62 tmp18 [V62 ] ( 0, 0 ) simd32 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V63 tmp19 [V63 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
; V64 tmp20 [V64,T16] ( 5, 2.50) int -> rdx "Inline stloc first use temp"
; V65 tmp21 [V65,T12] ( 3, 3 ) byref -> rax "Inlining Arg"
;* V66 tmp22 [V66 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-; V67 tmp23 [V67,T17] ( 5, 2.50) int -> rdx "Inline stloc first use temp"
-; V68 tmp24 [V68,T13] ( 3, 3 ) byref -> rcx "Inlining Arg"
+; V67 tmp23 [V67,T17] ( 5, 2.50) int -> rcx "Inline stloc first use temp"
+; V68 tmp24 [V68,T13] ( 3, 3 ) byref -> rdx "Inlining Arg"
; V69 tmp25 [V69,T07] ( 7, 7 ) int -> rax "Single return block return value"
;* V70 tmp26 [V70 ] ( 0, 0 ) simd32 -> zero-ref "field V08._lower (fldOffset=0x0)" P-INDEP
;* V71 tmp27 [V71 ] ( 0, 0 ) simd32 -> zero-ref "field V08._upper (fldOffset=0x20)" P-INDEP
;* V72 tmp28 [V72 ] ( 0, 0 ) simd32 -> zero-ref "field V09._lower (fldOffset=0x0)" P-INDEP
;* V73 tmp29 [V73 ] ( 0, 0 ) simd32 -> zero-ref "field V09._upper (fldOffset=0x20)" P-INDEP
;* V74 tmp30 [V74 ] ( 0, 0 ) simd32 -> zero-ref "field V10._lower (fldOffset=0x0)" P-INDEP
;* V75 tmp31 [V75 ] ( 0, 0 ) simd32 -> zero-ref "field V10._upper (fldOffset=0x20)" P-INDEP
;* V76 tmp32 [V76 ] ( 0, 0 ) simd32 -> zero-ref "field V12._lower (fldOffset=0x0)" P-INDEP
;* V77 tmp33 [V77 ] ( 0, 0 ) simd32 -> zero-ref "field V12._upper (fldOffset=0x20)" P-INDEP
;* V78 tmp34 [V78 ] ( 0, 0 ) simd32 -> zero-ref "field V13._lower (fldOffset=0x0)" P-INDEP
;* V79 tmp35 [V79 ] ( 0, 0 ) simd32 -> zero-ref "field V13._upper (fldOffset=0x20)" P-INDEP
;* V80 tmp36 [V80 ] ( 0, 0 ) simd32 -> zero-ref "field V14._lower (fldOffset=0x0)" P-INDEP
;* V81 tmp37 [V81 ] ( 0, 0 ) simd32 -> zero-ref "field V14._upper (fldOffset=0x20)" P-INDEP
;* V82 tmp38 [V82 ] ( 0, 0 ) simd32 -> zero-ref "field V17._lower (fldOffset=0x0)" P-INDEP
;* V83 tmp39 [V83 ] ( 0, 0 ) simd32 -> zero-ref "field V17._upper (fldOffset=0x20)" P-INDEP
;* V84 tmp40 [V84 ] ( 0, 0 ) simd32 -> zero-ref "field V18._lower (fldOffset=0x0)" P-INDEP
;* V85 tmp41 [V85 ] ( 0, 0 ) simd32 -> zero-ref "field V18._upper (fldOffset=0x20)" P-INDEP
;* V86 tmp42 [V86 ] ( 0, 0 ) simd32 -> zero-ref "field V19._lower (fldOffset=0x0)" P-INDEP
;* V87 tmp43 [V87 ] ( 0, 0 ) simd32 -> zero-ref "field V19._upper (fldOffset=0x20)" P-INDEP
; V88 cse0 [V88,T08] ( 7, 6.75) int -> r10 hoist multi-def "CSE #01: moderate"
;
; Lcl frame size = 0
; Prolog and first dispatch. Per the local-variable table above:
; rdi = arg0 (byref), rsi/rdx/rcx = arg1/arg2/arg3 (short), r8 = arg4 (int).
; NOTE(review): arg4 is used below as an element count; the listing itself only shows its type.
G_M33471_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M33471_IG02:
; arg4 < 8 (signed): skip the vector paths and go to the scalar code at IG10.
cmp r8d, 8
jl G_M33471_IG10
;; size=10 bbWeight=1 PerfScore 1.25
; 128-bit path, taken when 8 <= arg4 <= 16 (arg4 > 16 branches to IG05).
; rax is seeded with the start address (rdi) for the IG05 path.
G_M33471_IG03:
mov rax, rdi
cmp r8d, 16
; Diff: the branch to IG05 shrinks to its SHORT form, and the movzx of sil is dropped.
- jg G_M33471_IG05
- movzx rsi, sil
+ jg SHORT G_M33471_IG05
+ ;; NOP compensation instructions of 4 bytes.
; Broadcast the low byte of each of the three search values.
; The base zero-extended each value first; the diff drops those movzx instructions
; (vpbroadcastb uses only the low byte of its xmm source).
vmovd xmm0, esi
vpbroadcastb xmm0, xmm0
- movzx rdx, dl
vmovd xmm1, edx
vpbroadcastb xmm1, xmm1
- movzx rcx, cl
vmovd xmm2, ecx
vpbroadcastb xmm2, xmm2
; r8d = arg4 - 8; rax = rdi + 2*(arg4 - 8), i.e. the address of the last 8 16-bit elements.
; Diff: the index temp moves from rax to rsi.
add r8d, -8
- movsxd rax, r8d
- lea rax, bword ptr [rdi+2*rax]
+ movsxd rsi, r8d
+ lea rax, bword ptr [rdi+2*rsi]
cmp rdi, rax
; First load address = unsigned min(rdi, rax). Diff: held in rdx instead of rcx.
- mov rcx, rdi
- cmova rcx, rax
- vmovups xmm3, xmmword ptr [rcx]
+ mov rdx, rdi
+ cmova rdx, rax
+ vmovups xmm3, xmmword ptr [rdx]
; Pack both 8-word vectors into 16 bytes, compare against all three broadcasts, and OR the masks.
vpackuswb xmm3, xmm3, xmmword ptr [rax]
vpcmpeqb xmm0, xmm3, xmm0
vpcmpeqb xmm1, xmm3, xmm1
vpor xmm0, xmm1, xmm0
vpcmpeqb xmm1, xmm3, xmm2
vpor xmm0, xmm1, xmm0
; No byte matched any value: return -1 via IG16.
vptest xmm0, xmm0
je G_M33471_IG16
; Bit position of the first match. Positions 0..7 come from the first vector, 8..15 from the vector at rax.
; Diff: position held in ecx instead of edx.
- vpmovmskb edx, xmm0
- tzcnt edx, edx
- cmp edx, 8
+ vpmovmskb ecx, xmm0
+ tzcnt ecx, ecx
+ cmp ecx, 8
jl SHORT G_M33471_IG04
; Match is in the second vector: rebase on rax and make the position relative to it.
- mov rcx, rax
- add edx, -8
- ;; size=129 bbWeight=0.50 PerfScore 15.96
+ mov rdx, rax
+ add ecx, -8
+ ;; size=119 bbWeight=0.50 PerfScore 15.58
G_M33471_IG04:
; Result = (vector base - start) / 2 + position in vector (same computation, with rcx and rdx swapped in the diff).
- sub rcx, rdi
- shr rcx, 1
- lea eax, [rcx+rdx]
+ sub rdx, rdi
+ shr rdx, 1
+ lea eax, [rdx+rcx]
jmp G_M33471_IG19
; Diff: alignment padding ahead of the IG06 loop grows from 0 bytes to 1.
- align [0 bytes for IG06]
- ;; size=14 bbWeight=0.50 PerfScore 1.62
+ align [1 bytes for IG06]
+ ;; size=15 bbWeight=0.50 PerfScore 1.62
; 256-bit path, reached from IG03 when arg4 > 16. rax still holds the start address copied from rdi in IG03.
G_M33471_IG05:
; Broadcast the low byte of each search value across ymm0/ymm1/ymm2.
; Diff: the three movzx zero-extensions emitted by the base are dropped.
- movzx rsi, sil
vmovd xmm0, esi
vpbroadcastb ymm0, ymm0
- movzx rdx, dl
vmovd xmm1, edx
vpbroadcastb ymm1, ymm1
- movzx rcx, cl
vmovd xmm2, ecx
vpbroadcastb ymm2, ymm2
; arg4 <= 32: skip the main loop and go straight to the tail at IG07.
cmp r8d, 32
jle SHORT G_M33471_IG07
; rcx = rax + 2*(arg4 - 32): upper bound for the loop pointer.
; Diff: the temps move from ecx/rcx to esi/rdx.
- lea ecx, [r8-0x20]
- movsxd rcx, ecx
- lea rcx, bword ptr [rax+2*rcx]
- ;; size=54 bbWeight=0.50 PerfScore 7.62
+ lea esi, [r8-0x20]
+ movsxd rdx, esi
+ lea rcx, bword ptr [rax+2*rdx]
+ ;; size=44 bbWeight=0.50 PerfScore 7.25
G_M33471_IG06:
; Main loop: pack 2 x 32 bytes (32 16-bit elements) into 32 bytes, compare against
; the three broadcasts, and OR the three masks into ymm3.
vmovups ymm3, ymmword ptr [rax]
vpackuswb ymm3, ymm3, ymmword ptr [rax+0x20]
vpcmpeqb ymm4, ymm3, ymm0
vpcmpeqb ymm5, ymm3, ymm1
vpor ymm4, ymm5, ymm4
vpcmpeqb ymm3, ymm3, ymm2
vpor ymm3, ymm3, ymm4
; Any match: compute the index at IG09 (the combined mask stays in ymm3).
vptest ymm3, ymm3
jne SHORT G_M33471_IG09
; Advance 64 bytes and continue while rax < rcx (unsigned).
add rax, 64
cmp rax, rcx
jb SHORT G_M33471_IG06
;; size=45 bbWeight=4 PerfScore 70.67
; 256-bit tail: handles whatever the IG06 loop did not cover (or everything when arg4 <= 32).
G_M33471_IG07:
; rcx = rdi + 2*(arg4 - 16): address of the last 16 elements.
add r8d, -16
movsxd rcx, r8d
lea rcx, bword ptr [rdi+2*rcx]
; rax = unsigned min(rax, rcx): first load address for the tail.
cmp rax, rcx
cmova rax, rcx
vmovups ymm3, ymmword ptr [rax]
vpackuswb ymm3, ymm3, ymmword ptr [rcx]
; Compare against all three broadcasts and OR the masks into ymm0.
vpcmpeqb ymm0, ymm0, ymm3
vpcmpeqb ymm1, ymm1, ymm3
vpor ymm0, ymm1, ymm0
vpcmpeqb ymm1, ymm2, ymm3
vpor ymm0, ymm1, ymm0
; No match: return -1 via IG16.
vptest ymm0, ymm0
je G_M33471_IG16
; vpermq imm -40 (0xD8) selects qwords 0,2,1,3. vpackuswb packs per 128-bit lane, so this
; restores source order: mask bits 0..15 <- vector at rax, bits 16..31 <- vector at rcx.
vpermq ymm0, ymm0, -40
vpmovmskb edx, ymm0
tzcnt edx, edx
cmp edx, 16
jl SHORT G_M33471_IG08
; Match is in the second vector: rebase on rcx and make the position relative to it.
mov rax, rcx
add edx, -16
;; size=82 bbWeight=0.50 PerfScore 13.21
G_M33471_IG08:
; Result = (vector base - start) / 2 + position in vector.
sub rax, rdi
shr rax, 1
add eax, edx
jmp G_M33471_IG19
; Diff: alignment padding for IG13 grows from 0 bytes to 4.
- align [0 bytes for IG13]
- ;; size=13 bbWeight=0.50 PerfScore 1.50
+ align [4 bytes for IG13]
+ ;; size=17 bbWeight=0.50 PerfScore 1.50
G_M33471_IG09:
; Match found inside the IG06 loop: eax = element index of the current loop position ...
sub rax, rdi
shr rax, 1
; ... plus the position of the first set bit in the lane-corrected mask from ymm3.
vpermq ymm0, ymm3, -40
vpmovmskb edi, ymm0
; NOTE(review): the xor before tzcnt is presumably there to break tzcnt's dependency on its destination register — confirm.
xor ecx, ecx
tzcnt ecx, edi
add eax, ecx
jmp G_M33471_IG19
;; size=29 bbWeight=0.50 PerfScore 5.12
; Scalar path for arg4 < 8 (reached from IG02). This block is identical in base and diff.
G_M33471_IG10:
; r9 = running element index, starting at 0.
xor r9d, r9d
; Fewer than 4 elements: skip the unrolled block.
cmp r8d, 4
jl G_M33471_IG11
; Unrolled check of elements 0..3. Each element (sign-extended into eax) is compared with
; the sign-extended arg1 (r10), arg2 and arg3. r9 is reused as a compare temp here and is
; reset to 4 at the end of the block.
add r8d, -4
; Element 0 -> IG24 (returns 0).
movsx rax, word ptr [rdi]
movsx r10, si
cmp eax, r10d
je G_M33471_IG24
movsx r9, dx
cmp eax, r9d
je G_M33471_IG24
movsx r9, cx
cmp eax, r9d
sete al
movzx rax, al
test al, al
jne G_M33471_IG24
; Element 1 -> IG22 (returns 1).
movsx rax, word ptr [rdi+0x02]
cmp eax, r10d
je G_M33471_IG22
movsx r9, dx
cmp eax, r9d
je G_M33471_IG22
movsx r9, cx
cmp eax, r9d
sete al
movzx rax, al
test al, al
jne G_M33471_IG22
; Element 2 -> IG20 (returns 2).
movsx rax, word ptr [rdi+0x04]
cmp eax, r10d
je G_M33471_IG20
movsx r9, dx
cmp eax, r9d
je G_M33471_IG20
movsx r9, cx
cmp eax, r9d
sete al
movzx rax, al
test al, al
jne SHORT G_M33471_IG20
; Element 3 -> IG18 (returns 3). Note that r10 is overwritten with arg2 here;
; IG12 reloads it from si before the remainder loop uses it.
movsx rax, word ptr [rdi+0x06]
cmp eax, r10d
je SHORT G_M33471_IG18
movsx r10, dx
cmp eax, r10d
je SHORT G_M33471_IG18
movsx r9, cx
cmp eax, r9d
sete al
movzx rax, al
test al, al
jne SHORT G_M33471_IG18
; None of the first four matched: continue from index 4.
mov r9d, 4
;; size=202 bbWeight=0.50 PerfScore 20.62
; Scalar remainder loop: r8d = elements still to examine, r9 = element index.
G_M33471_IG11:
; Nothing left to examine: return -1 via IG16.
test r8d, r8d
jle SHORT G_M33471_IG16
;; size=5 bbWeight=0.50 PerfScore 0.62
G_M33471_IG12:
; r10 = sign-extended arg1, set up once before the loop.
movsx r10, si
;; size=4 bbWeight=0.25 PerfScore 0.06
G_M33471_IG13:
; Load the next element and compare it with arg1 first.
dec r8d
movsx rax, word ptr [rdi+2*r9]
cmp eax, r10d
je SHORT G_M33471_IG17
;; size=13 bbWeight=4 PerfScore 22.00
G_M33471_IG14:
; Then compare with arg2 and arg3, both sign-extended into rsi on every iteration.
movsx rsi, dx
cmp eax, esi
je SHORT G_M33471_IG17
movsx rsi, cx
cmp eax, esi
sete al
movzx rax, al
test al, al
jne SHORT G_M33471_IG17
;; size=24 bbWeight=2 PerfScore 9.00
G_M33471_IG15:
; No match: advance the index and loop while elements remain.
inc r9
test r8d, r8d
jg SHORT G_M33471_IG13
;; size=8 bbWeight=4 PerfScore 6.00
; Return blocks. Each sets eax and then runs an epilog (vzeroupper / pop rbp / ret).
G_M33471_IG16:
; Not found.
mov eax, -1
jmp SHORT G_M33471_IG19
;; size=7 bbWeight=0.50 PerfScore 1.12
G_M33471_IG17:
; Found in the scalar loop (IG13/IG14): return the element index held in r9d.
mov eax, r9d
jmp SHORT G_M33471_IG19
;; size=5 bbWeight=0.50 PerfScore 1.12
G_M33471_IG18:
; Found at element 3 in the unrolled block of IG10; falls through into the epilog.
mov eax, 3
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M33471_IG19:
; Epilog shared by the vector paths and by IG16/IG17/IG18.
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M33471_IG20:
; Found at element 2 in the unrolled block of IG10.
mov eax, 2
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M33471_IG21:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M33471_IG22:
; Found at element 1 in the unrolled block of IG10.
mov eax, 1
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M33471_IG23:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M33471_IG24:
; Found at element 0 in the unrolled block of IG10.
xor eax, eax
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M33471_IG25:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
-; Total bytes of code 685, prolog size 4, PerfScore 184.27, instruction count 195, allocated bytes for code 685 (MethodHash=2ec77d40) for method System.PackedSpanHelpers:IndexOfAny[System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,int):int (FullOpts)
+; Total bytes of code 670, prolog size 4, PerfScore 183.52, instruction count 189, allocated bytes for code 670 (MethodHash=2ec77d40) for method System.PackedSpanHelpers:IndexOfAny[System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,int):int (FullOpts)
; ============================================================
-12 (-1.85 % of base) - System.String:MakeSeparatorListVectorized(System.ReadOnlySpan`1[ushort],byref,ushort,ushort,ushort)
; Assembly listing for method System.String:MakeSeparatorListVectorized(System.ReadOnlySpan`1[ushort],byref,ushort,ushort,ushort) (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 5 single block inlinees; 3 inlinees without PGO data
; Final local variable assignments
;
;* V00 arg0 [V00 ] ( 0, 0 ) struct (16) zero-ref multireg-arg ld-addr-op single-def <System.ReadOnlySpan`1[ushort]>
; V01 arg1 [V01,T00] ( 17,140 ) byref -> rbx single-def
; V02 arg2 [V02,T20] ( 4, 3 ) ushort -> rcx single-def
; V03 arg3 [V03,T17] ( 5, 5 ) ushort -> r15 single-def
; V04 arg4 [V04,T18] ( 5, 5 ) ushort -> r14 single-def
; V05 loc0 [V05,T12] ( 5, 18 ) long -> r13
; V06 loc1 [V06,T01] ( 16, 87 ) long -> r12
; V07 loc2 [V07,T13] ( 4, 13 ) byref -> [rbp-0xE0] spill-single-def
;* V08 loc3 [V08 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ushort]>
;* V09 loc4 [V09 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ushort]>
;* V10 loc5 [V10 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ushort]>
;* V11 loc6 [V11 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ushort]>
;* V12 loc7 [V12 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ushort]>
;* V13 loc8 [V13 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ushort]>
;* V14 loc9 [V14 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V15 loc10 [V15 ] ( 0, 0 ) long -> zero-ref
;* V16 loc11 [V16 ] ( 0, 0 ) int -> zero-ref
; V17 loc12 [V17,T29] ( 2, 4.50) simd32 -> [rbp-0x50] spill-single-def <System.Runtime.Intrinsics.Vector256`1[ushort]>
; V18 loc13 [V18,T30] ( 2, 4.50) simd32 -> [rbp-0x70] spill-single-def <System.Runtime.Intrinsics.Vector256`1[ushort]>
; V19 loc14 [V19,T31] ( 2, 4.50) simd32 -> [rbp-0x90] spill-single-def <System.Runtime.Intrinsics.Vector256`1[ushort]>
;* V20 loc15 [V20 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[ushort]>
;* V21 loc16 [V21 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[ushort]>
;* V22 loc17 [V22 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[ushort]>
; V23 loc18 [V23,T27] ( 4, 14 ) simd32 -> mm3 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V24 loc19 [V24,T02] ( 5, 66 ) int -> [rbp-0x94]
;* V25 loc20 [V25 ] ( 0, 0 ) int -> zero-ref
; V26 loc21 [V26,T32] ( 2, 4.50) simd16 -> [rbp-0xB0] spill-single-def <System.Runtime.Intrinsics.Vector128`1[ushort]>
; V27 loc22 [V27,T33] ( 2, 4.50) simd16 -> [rbp-0xC0] spill-single-def <System.Runtime.Intrinsics.Vector128`1[ushort]>
; V28 loc23 [V28,T34] ( 2, 4.50) simd16 -> [rbp-0xD0] spill-single-def <System.Runtime.Intrinsics.Vector128`1[ushort]>
;* V29 loc24 [V29 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ushort]>
;* V30 loc25 [V30 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ushort]>
;* V31 loc26 [V31 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ushort]>
; V32 loc27 [V32,T28] ( 4, 14 ) simd16 -> mm3 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V33 loc28 [V33,T03] ( 5, 66 ) int -> [rbp-0xD4]
;* V34 loc29 [V34 ] ( 0, 0 ) int -> zero-ref
; V35 loc30 [V35,T14] ( 4, 12 ) ushort -> rdi
;# V36 OutArgs [V36 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V37 tmp1 [V37,T25] ( 4, 32 ) simd16 -> mm3 "dup spill"
; V38 tmp2 [V38,T26] ( 4, 32 ) simd32 -> mm3 "dup spill"
;* V39 tmp3 [V39 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op "Inlining Arg" <System.ReadOnlySpan`1[ushort]>
; V40 tmp4 [V40,T06] ( 4, 48 ) int -> r8 "Inline stloc first use temp"
;* V41 tmp5 [V41 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op "Inline stloc first use temp" <System.Span`1[int]>
; V42 tmp6 [V42,T04] ( 3, 64 ) int -> rsi "Inlining Arg"
; V43 tmp7 [V43,T07] ( 4, 48 ) int -> r8 "Inline stloc first use temp"
;* V44 tmp8 [V44 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op "Inline stloc first use temp" <System.Span`1[int]>
; V45 tmp9 [V45,T05] ( 3, 64 ) int -> rsi "Inlining Arg"
; V46 tmp10 [V46,T16] ( 4, 8 ) int -> rcx "Inline stloc first use temp"
;* V47 tmp11 [V47 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op "Inline stloc first use temp" <System.Span`1[int]>
; V48 tmp12 [V48,T15] ( 3, 12 ) int -> rsi "Inlining Arg"
; V49 tmp13 [V49,T21] ( 2, 2 ) byref -> rdi single-def "field V00._reference (fldOffset=0x0)" P-INDEP
; V50 tmp14 [V50,T23] ( 2, 2 ) int -> rsi single-def "field V00._length (fldOffset=0x8)" P-INDEP
;* V51 tmp15 [V51 ] ( 0, 0 ) simd32 -> zero-ref "field V08._lower (fldOffset=0x0)" P-INDEP
;* V52 tmp16 [V52 ] ( 0, 0 ) simd32 -> zero-ref "field V08._upper (fldOffset=0x20)" P-INDEP
;* V53 tmp17 [V53 ] ( 0, 0 ) simd32 -> zero-ref "field V09._lower (fldOffset=0x0)" P-INDEP
;* V54 tmp18 [V54 ] ( 0, 0 ) simd32 -> zero-ref "field V09._upper (fldOffset=0x20)" P-INDEP
;* V55 tmp19 [V55 ] ( 0, 0 ) simd32 -> zero-ref "field V10._lower (fldOffset=0x0)" P-INDEP
;* V56 tmp20 [V56 ] ( 0, 0 ) simd32 -> zero-ref "field V10._upper (fldOffset=0x20)" P-INDEP
;* V57 tmp21 [V57 ] ( 0, 0 ) simd32 -> zero-ref "field V11._lower (fldOffset=0x0)" P-INDEP
;* V58 tmp22 [V58 ] ( 0, 0 ) simd32 -> zero-ref "field V11._upper (fldOffset=0x20)" P-INDEP
;* V59 tmp23 [V59 ] ( 0, 0 ) simd32 -> zero-ref "field V12._lower (fldOffset=0x0)" P-INDEP
;* V60 tmp24 [V60 ] ( 0, 0 ) simd32 -> zero-ref "field V12._upper (fldOffset=0x20)" P-INDEP
;* V61 tmp25 [V61 ] ( 0, 0 ) simd32 -> zero-ref "field V13._lower (fldOffset=0x0)" P-INDEP
;* V62 tmp26 [V62 ] ( 0, 0 ) simd32 -> zero-ref "field V13._upper (fldOffset=0x20)" P-INDEP
;* V63 tmp27 [V63 ] ( 0, 0 ) simd32 -> zero-ref "field V14._lower (fldOffset=0x0)" P-INDEP
;* V64 tmp28 [V64 ] ( 0, 0 ) simd32 -> zero-ref "field V14._upper (fldOffset=0x20)" P-INDEP
;* V65 tmp29 [V65 ] ( 0, 0 ) byref -> zero-ref single-def "field V39._reference (fldOffset=0x0)" P-INDEP
;* V66 tmp30 [V66 ] ( 0, 0 ) int -> zero-ref "field V39._length (fldOffset=0x8)" P-INDEP
; V67 tmp31 [V67,T10] ( 2, 24 ) byref -> r9 "field V41._reference (fldOffset=0x0)" P-INDEP
; V68 tmp32 [V68,T08] ( 2, 32 ) int -> rdi "field V41._length (fldOffset=0x8)" P-INDEP
; V69 tmp33 [V69,T11] ( 2, 24 ) byref -> r9 "field V44._reference (fldOffset=0x0)" P-INDEP
; V70 tmp34 [V70,T09] ( 2, 32 ) int -> rdi "field V44._length (fldOffset=0x8)" P-INDEP
; V71 tmp35 [V71,T22] ( 2, 4 ) byref -> r8 "field V47._reference (fldOffset=0x0)" P-INDEP
; V72 tmp36 [V72,T24] ( 2, 4 ) int -> rdi "field V47._length (fldOffset=0x8)" P-INDEP
; V73 cse0 [V73,T19] ( 5, 6 ) int -> [rbp-0xD8] multi-def "CSE #01: moderate"
;
; Lcl frame size = 184
G_M10293_IG01:
push rbp
push r15
push r14
push r13
push r12
push rbx
sub rsp, 184
lea rbp, [rsp+0xE0]
mov rbx, rdx
mov r15d, r8d
mov r14d, r9d
;; size=34 bbWeight=1 PerfScore 7.50
G_M10293_IG02:
mov r13d, esi
xor r12d, r12d
mov rax, rdi
mov bword ptr [rbp-0xE0], rax
cmp r13, 32
jae G_M10293_IG12
;; size=26 bbWeight=1 PerfScore 3.00
G_M10293_IG03:
movzx rdx, cx
mov dword ptr [rbp-0xD8], edx
vmovd xmm0, edx
vpbroadcastw xmm0, xmm0
vmovaps xmmword ptr [rbp-0xB0], xmm0
- movzx rdi, r15w
- vmovd xmm1, edi
+ vmovd xmm1, r15d
vpbroadcastw xmm1, xmm1
vmovaps xmmword ptr [rbp-0xC0], xmm1
- movzx rdi, r14w
- vmovd xmm2, edi
+ vmovd xmm2, r14d
vpbroadcastw xmm2, xmm2
vmovaps xmmword ptr [rbp-0xD0], xmm2
jmp SHORT G_M10293_IG05
- ;; size=70 bbWeight=0.50 PerfScore 7.88
+ ;; size=64 bbWeight=0.50 PerfScore 7.62
G_M10293_IG04:
mov rax, bword ptr [rbp-0xE0]
;; size=7 bbWeight=2 PerfScore 2.00
G_M10293_IG05:
vmovups xmm3, xmmword ptr [rax+2*r12]
vpcmpeqw xmm4, xmm3, xmm0
vpcmpeqw xmm5, xmm3, xmm1
vpor xmm4, xmm5, xmm4
vpcmpeqw xmm3, xmm3, xmm2
vpor xmm3, xmm3, xmm4
vptest xmm3, xmm3
je SHORT G_M10293_IG11
;; size=33 bbWeight=4 PerfScore 40.67
G_M10293_IG06:
vpmovmskb ecx, xmm3
and ecx, 0x5555
;; size=10 bbWeight=2 PerfScore 4.50
G_M10293_IG07:
mov dword ptr [rbp-0xD4], ecx
xor edi, edi
tzcnt edi, ecx
shr edi, 1
mov esi, edi
add esi, r12d
mov r8d, dword ptr [rbx+0x08]
mov r9, bword ptr [rbx+0x10]
mov edi, dword ptr [rbx+0x18]
cmp r8d, edi
jb SHORT G_M10293_IG09
;; size=35 bbWeight=16 PerfScore 184.00
G_M10293_IG08:
mov rdi, rbx
mov r8, 0xD1FFAB1E ; code for System.Collections.Generic.ValueListBuilder`1[int]:AddWithResize(int):this
call [r8]System.Collections.Generic.ValueListBuilder`1[int]:AddWithResize(int):this
jmp SHORT G_M10293_IG10
;; size=18 bbWeight=8 PerfScore 44.00
G_M10293_IG09:
mov edi, r8d
mov dword ptr [r9+4*rdi], esi
inc r8d
mov dword ptr [rbx+0x08], r8d
;; size=14 bbWeight=8 PerfScore 20.00
G_M10293_IG10:
blsr ecx, dword ptr [rbp-0xD4]
jne SHORT G_M10293_IG07
;; size=11 bbWeight=16 PerfScore 48.00
G_M10293_IG11:
add r12, 8
lea rdi, [r13-0x08]
cmp r12, rdi
vmovaps xmm0, xmmword ptr [rbp-0xB0]
vmovaps xmm1, xmmword ptr [rbp-0xC0]
vmovaps xmm2, xmmword ptr [rbp-0xD0]
jbe G_M10293_IG04
jmp G_M10293_IG22
;; size=46 bbWeight=4 PerfScore 52.00
G_M10293_IG12:
movzx rdx, cx
mov dword ptr [rbp-0xD8], edx
vmovd xmm0, edx
vpbroadcastw ymm0, ymm0
vmovups ymmword ptr [rbp-0x50], ymm0
- movzx rdi, r15w
- vmovd xmm1, edi
+ vmovd xmm1, r15d
vpbroadcastw ymm1, ymm1
vmovups ymmword ptr [rbp-0x70], ymm1
- movzx rdi, r14w
- vmovd xmm2, edi
+ vmovd xmm2, r14d
vpbroadcastw ymm2, ymm2
vmovups ymmword ptr [rbp-0x90], ymm2
jmp SHORT G_M10293_IG14
- ;; size=64 bbWeight=0.50 PerfScore 9.38
+ ;; size=58 bbWeight=0.50 PerfScore 9.12
G_M10293_IG13:
mov rax, bword ptr [rbp-0xE0]
;; size=7 bbWeight=2 PerfScore 2.00
G_M10293_IG14:
vmovups ymm3, ymmword ptr [rax+2*r12]
vpcmpeqw ymm4, ymm3, ymm0
vpcmpeqw ymm5, ymm3, ymm1
vpor ymm4, ymm5, ymm4
vpcmpeqw ymm3, ymm3, ymm2
vpor ymm3, ymm3, ymm4
vptest ymm3, ymm3
je SHORT G_M10293_IG20
;; size=33 bbWeight=4 PerfScore 52.67
G_M10293_IG15:
vpmovmskb ecx, ymm3
and ecx, 0xD1FFAB1E
;; size=10 bbWeight=2 PerfScore 6.50
G_M10293_IG16:
mov dword ptr [rbp-0x94], ecx
xor edi, edi
tzcnt edi, ecx
shr edi, 1
mov esi, edi
add esi, r12d
mov r8d, dword ptr [rbx+0x08]
mov r9, bword ptr [rbx+0x10]
mov edi, dword ptr [rbx+0x18]
cmp r8d, edi
jb SHORT G_M10293_IG18
;; size=35 bbWeight=16 PerfScore 184.00
G_M10293_IG17:
mov rdi, rbx
mov r8, 0xD1FFAB1E ; code for System.Collections.Generic.ValueListBuilder`1[int]:AddWithResize(int):this
call [r8]System.Collections.Generic.ValueListBuilder`1[int]:AddWithResize(int):this
jmp SHORT G_M10293_IG19
;; size=18 bbWeight=8 PerfScore 44.00
G_M10293_IG18:
mov edi, r8d
mov dword ptr [r9+4*rdi], esi
inc r8d
mov dword ptr [rbx+0x08], r8d
;; size=14 bbWeight=8 PerfScore 20.00
G_M10293_IG19:
blsr ecx, dword ptr [rbp-0x94]
jne SHORT G_M10293_IG16
;; size=11 bbWeight=16 PerfScore 48.00
G_M10293_IG20:
add r12, 16
lea rdi, [r13-0x10]
cmp r12, rdi
vmovups ymm0, ymmword ptr [rbp-0x50]
vmovups ymm1, ymmword ptr [rbp-0x70]
vmovups ymm2, ymmword ptr [rbp-0x90]
jbe G_M10293_IG13
jmp SHORT G_M10293_IG22
;; size=37 bbWeight=4 PerfScore 64.00
G_M10293_IG21:
inc r12
;; size=3 bbWeight=4 PerfScore 1.00
G_M10293_IG22:
cmp r12, r13
jae SHORT G_M10293_IG27
;; size=5 bbWeight=8 PerfScore 10.00
G_M10293_IG23:
mov rax, bword ptr [rbp-0xE0]
movzx rdi, word ptr [rax+2*r12]
mov edx, dword ptr [rbp-0xD8]
cmp edi, edx
je SHORT G_M10293_IG25
;; size=22 bbWeight=4 PerfScore 21.00
G_M10293_IG24:
movzx rsi, r15w
cmp edi, esi
je SHORT G_M10293_IG25
movzx rsi, r14w
cmp edi, esi
jne SHORT G_M10293_IG21
;; size=16 bbWeight=2 PerfScore 6.00
G_M10293_IG25:
mov esi, r12d
mov ecx, dword ptr [rbx+0x08]
mov r8, bword ptr [rbx+0x10]
mov edi, dword ptr [rbx+0x18]
cmp ecx, edi
jb SHORT G_M10293_IG26
mov rdi, rbx
mov rcx, 0xD1FFAB1E ; code for System.Collections.Generic.ValueListBuilder`1[int]:AddWithResize(int):this
call [rcx]System.Collections.Generic.ValueListBuilder`1[int]:AddWithResize(int):this
jmp SHORT G_M10293_IG21
;; size=34 bbWeight=2 PerfScore 26.00
G_M10293_IG26:
mov edi, ecx
mov dword ptr [r8+4*rdi], esi
inc ecx
mov dword ptr [rbx+0x08], ecx
jmp SHORT G_M10293_IG21
;; size=13 bbWeight=2 PerfScore 9.00
G_M10293_IG27:
vzeroupper
add rsp, 184
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
;; size=21 bbWeight=1 PerfScore 5.25
-; Total bytes of code 647, prolog size 34, PerfScore 922.33, instruction count 163, allocated bytes for code 647 (MethodHash=8ebed7ca) for method System.String:MakeSeparatorListVectorized(System.ReadOnlySpan`1[ushort],byref,ushort,ushort,ushort) (FullOpts)
+; Total bytes of code 635, prolog size 34, PerfScore 921.83, instruction count 159, allocated bytes for code 635 (MethodHash=8ebed7ca) for method System.String:MakeSeparatorListVectorized(System.ReadOnlySpan`1[ushort],byref,ushort,ushort,ushort) (FullOpts)
; ============================================================
-10 (-1.78 % of base) - System.PackedSpanHelpers:IndexOfAny[System.SpanHelpers+DontNegate`1[short],System.PackedSpanHelpers+NopTransform](byref,short,short,int):int
; Assembly listing for method System.PackedSpanHelpers:IndexOfAny[System.SpanHelpers+DontNegate`1[short],System.PackedSpanHelpers+NopTransform](byref,short,short,int):int (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 19 single block inlinees; 5 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T03] ( 15, 12 ) byref -> rdi single-def
; V01 arg1 [V01,T09] ( 6, 3.75) short -> rsi single-def
; V02 arg2 [V02,T05] ( 9, 7 ) short -> rdx single-def
; V03 arg3 [V03,T01] ( 15, 19.50) int -> rcx
; V04 loc0 [V04,T04] ( 6, 13.50) long -> r8
; V05 loc1 [V05,T02] ( 15, 16 ) short -> rax
; V06 loc2 [V06,T00] ( 10, 22.50) byref -> rax
;* V07 loc3 [V07 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V08 loc4 [V08 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V09 loc5 [V09 ] ( 0, 0 ) byref -> zero-ref
;* V10 loc6 [V10 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[short]>
;* V11 loc7 [V11 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V12 loc8 [V12 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V13 loc9 [V13 ] ( 0, 0 ) byref -> zero-ref
;* V14 loc10 [V14 ] ( 0, 0 ) byref -> zero-ref
;* V15 loc11 [V15 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[short]>
;* V16 loc12 [V16 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V17 loc13 [V17 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
; V18 loc14 [V18,T25] ( 3, 5 ) simd32 -> mm0 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V19 loc15 [V19,T26] ( 3, 5 ) simd32 -> mm1 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V20 loc16 [V20,T10] ( 2, 4.50) byref -> rdx single-def
;* V21 loc17 [V21 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[short]>
; V22 loc18 [V22,T24] ( 3, 12 ) simd32 -> mm2 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V23 loc19 [V23,T23] ( 4, 12.50) simd32 -> mm2 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V24 loc20 [V24,T13] ( 5, 2.50) byref -> rcx single-def
;* V25 loc21 [V25 ] ( 0, 0 ) byref -> zero-ref single-def
;* V26 loc22 [V26 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[short]>
; V27 loc23 [V27,T29] ( 3, 1.50) simd32 -> mm2 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V28 loc24 [V28,T27] ( 4, 2 ) simd32 -> mm0 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V29 loc25 [V29,T31] ( 2, 1 ) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V30 loc26 [V30,T32] ( 2, 1 ) simd16 -> mm1 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V31 loc27 [V31 ] ( 0, 0 ) byref -> zero-ref
;* V32 loc28 [V32 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[short]>
;* V33 loc29 [V33 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V34 loc30 [V34 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V35 loc31 [V35,T14] ( 5, 2.50) byref -> rax single-def
;* V36 loc32 [V36 ] ( 0, 0 ) byref -> zero-ref single-def
;* V37 loc33 [V37 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[short]>
; V38 loc34 [V38,T30] ( 3, 1.50) simd16 -> mm2 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V39 loc35 [V39,T28] ( 4, 2 ) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;# V40 OutArgs [V40 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-; V41 tmp1 [V41,T17] ( 3, 1.50) byref -> rcx
+; V41 tmp1 [V41,T17] ( 3, 1.50) byref -> rdx
;* V42 tmp2 [V42 ] ( 0, 0 ) simd16 -> zero-ref "impAppendStmt"
;* V43 tmp3 [V43 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
; V44 tmp4 [V44,T18] ( 3, 1.50) byref -> rax
;* V45 tmp5 [V45 ] ( 0, 0 ) simd32 -> zero-ref "impAppendStmt"
;* V46 tmp6 [V46 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
;* V47 tmp7 [V47 ] ( 0, 0 ) simd32 -> zero-ref "impAppendStmt"
;* V48 tmp8 [V48 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
; V49 tmp9 [V49,T08] ( 2, 4 ) int -> rax
; V50 tmp10 [V50,T19] ( 2, 1 ) int -> rax
; V51 tmp11 [V51,T20] ( 2, 1 ) int -> rax
; V52 tmp12 [V52,T21] ( 2, 1 ) int -> rax
; V53 tmp13 [V53,T22] ( 2, 1 ) int -> rax
;* V54 tmp14 [V54 ] ( 0, 0 ) short -> zero-ref "Inlining Arg"
;* V55 tmp15 [V55 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V56 tmp16 [V56 ] ( 0, 0 ) short -> zero-ref "Inlining Arg"
;* V57 tmp17 [V57 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V58 tmp18 [V58 ] ( 0, 0 ) short -> zero-ref "Inlining Arg"
;* V59 tmp19 [V59 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V60 tmp20 [V60 ] ( 0, 0 ) short -> zero-ref "Inlining Arg"
;* V61 tmp21 [V61 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V62 tmp22 [V62 ] ( 0, 0 ) short -> zero-ref "Inlining Arg"
;* V63 tmp23 [V63 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V64 tmp24 [V64 ] ( 0, 0 ) simd32 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V65 tmp25 [V65 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
;* V66 tmp26 [V66 ] ( 0, 0 ) simd32 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V67 tmp27 [V67 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
; V68 tmp28 [V68,T15] ( 5, 2.50) int -> rdx "Inline stloc first use temp"
; V69 tmp29 [V69,T11] ( 3, 3 ) byref -> rax "Inlining Arg"
;* V70 tmp30 [V70 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-; V71 tmp31 [V71,T16] ( 5, 2.50) int -> rdx "Inline stloc first use temp"
-; V72 tmp32 [V72,T12] ( 3, 3 ) byref -> rcx "Inlining Arg"
+; V71 tmp31 [V71,T16] ( 5, 2.50) int -> rcx "Inline stloc first use temp"
+; V72 tmp32 [V72,T12] ( 3, 3 ) byref -> rdx "Inlining Arg"
; V73 tmp33 [V73,T06] ( 7, 7 ) int -> rax "Single return block return value"
;* V74 tmp34 [V74 ] ( 0, 0 ) simd32 -> zero-ref "field V07._lower (fldOffset=0x0)" P-INDEP
;* V75 tmp35 [V75 ] ( 0, 0 ) simd32 -> zero-ref "field V07._upper (fldOffset=0x20)" P-INDEP
;* V76 tmp36 [V76 ] ( 0, 0 ) simd32 -> zero-ref "field V08._lower (fldOffset=0x0)" P-INDEP
;* V77 tmp37 [V77 ] ( 0, 0 ) simd32 -> zero-ref "field V08._upper (fldOffset=0x20)" P-INDEP
;* V78 tmp38 [V78 ] ( 0, 0 ) simd32 -> zero-ref "field V10._lower (fldOffset=0x0)" P-INDEP
;* V79 tmp39 [V79 ] ( 0, 0 ) simd32 -> zero-ref "field V10._upper (fldOffset=0x20)" P-INDEP
;* V80 tmp40 [V80 ] ( 0, 0 ) simd32 -> zero-ref "field V11._lower (fldOffset=0x0)" P-INDEP
;* V81 tmp41 [V81 ] ( 0, 0 ) simd32 -> zero-ref "field V11._upper (fldOffset=0x20)" P-INDEP
;* V82 tmp42 [V82 ] ( 0, 0 ) simd32 -> zero-ref "field V12._lower (fldOffset=0x0)" P-INDEP
;* V83 tmp43 [V83 ] ( 0, 0 ) simd32 -> zero-ref "field V12._upper (fldOffset=0x20)" P-INDEP
;* V84 tmp44 [V84 ] ( 0, 0 ) simd32 -> zero-ref "field V15._lower (fldOffset=0x0)" P-INDEP
;* V85 tmp45 [V85 ] ( 0, 0 ) simd32 -> zero-ref "field V15._upper (fldOffset=0x20)" P-INDEP
;* V86 tmp46 [V86 ] ( 0, 0 ) simd32 -> zero-ref "field V16._lower (fldOffset=0x0)" P-INDEP
;* V87 tmp47 [V87 ] ( 0, 0 ) simd32 -> zero-ref "field V16._upper (fldOffset=0x20)" P-INDEP
;* V88 tmp48 [V88 ] ( 0, 0 ) simd32 -> zero-ref "field V17._lower (fldOffset=0x0)" P-INDEP
;* V89 tmp49 [V89 ] ( 0, 0 ) simd32 -> zero-ref "field V17._upper (fldOffset=0x20)" P-INDEP
; V90 cse0 [V90,T07] ( 7, 6.75) int -> r9 hoist multi-def "CSE #01: moderate"
;
; Lcl frame size = 0
; JIT diff for PackedSpanHelpers:IndexOfAny (byref,short,short,int):int.
; Lines prefixed "-" are the base codegen, "+" the diff codegen; unprefixed
; lines are common to both.
; Incoming registers per the variable table: rdi = byref to the data,
; esi / edx = the two short values searched for, ecx = length; the int
; result is returned in eax.
; Frame setup only; there are no stack locals.
G_M59761_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
; Lengths below 8 take the scalar path at IG10.
G_M59761_IG02:
cmp ecx, 8
jl G_M59761_IG10
;; size=9 bbWeight=1 PerfScore 1.25
; Lengths 8..16: one 128-bit step. The low byte of each value is broadcast
; to xmm0 / xmm1. rax = rdi + 2*(length-8) addresses the last 8 elements;
; the lower of rdi and rax is the first load address. Both 8-word loads
; are packed to 16 bytes and compared against the two broadcasts.
; The diff drops the movzx before each vmovd (vpbroadcastb reads only the
; low byte) and swaps the roles of rcx and rdx relative to the base.
G_M59761_IG03:
mov rax, rdi
cmp ecx, 16
jg SHORT G_M59761_IG05
- movzx rsi, sil
vmovd xmm0, esi
vpbroadcastb xmm0, xmm0
- movzx rdx, dl
vmovd xmm1, edx
vpbroadcastb xmm1, xmm1
add ecx, -8
- movsxd rax, ecx
- lea rax, bword ptr [rdi+2*rax]
+ movsxd rsi, ecx
+ lea rax, bword ptr [rdi+2*rsi]
cmp rdi, rax
- mov rcx, rdi
- cmova rcx, rax
- vmovups xmm2, xmmword ptr [rcx]
+ mov rdx, rdi
+ cmova rdx, rax
+ vmovups xmm2, xmmword ptr [rdx]
vpackuswb xmm2, xmm2, xmmword ptr [rax]
vpcmpeqb xmm0, xmm2, xmm0
vpcmpeqb xmm1, xmm2, xmm1
vpor xmm0, xmm1, xmm0
vptest xmm0, xmm0
; No byte matched: return -1 via IG16.
je G_M59761_IG16
; Index of the first matching byte. A value of 8 or more means the hit is
; in the second load, so the base switches to rax and 8 is subtracted.
- vpmovmskb edx, xmm0
- tzcnt edx, edx
- cmp edx, 8
+ vpmovmskb ecx, xmm0
+ tzcnt ecx, ecx
+ cmp ecx, 8
jl SHORT G_M59761_IG04
- mov rcx, rax
- add edx, -8
- ;; size=103 bbWeight=0.50 PerfScore 13.92
+ mov rdx, rax
+ add ecx, -8
+ ;; size=96 bbWeight=0.50 PerfScore 13.67
; Result = (load base - rdi) / 2 + index within that load.
; The diff side also gains 4 bytes of alignment padding for IG06 here.
G_M59761_IG04:
- sub rcx, rdi
- shr rcx, 1
- lea eax, [rcx+rdx]
+ sub rdx, rdi
+ shr rdx, 1
+ lea eax, [rdx+rcx]
jmp G_M59761_IG19
- align [0 bytes for IG06]
- ;; size=14 bbWeight=0.50 PerfScore 1.62
+ align [4 bytes for IG06]
+ ;; size=18 bbWeight=0.50 PerfScore 1.62
; Lengths above 16: 256-bit path. Broadcast the low byte of each value to
; ymm0 / ymm1. Lengths up to 32 skip the loop and go straight to IG07;
; otherwise rdx = rax + 2*(length-32) becomes the loop bound.
G_M59761_IG05:
- movzx rsi, sil
vmovd xmm0, esi
vpbroadcastb ymm0, ymm0
- movzx rdx, dl
vmovd xmm1, edx
vpbroadcastb ymm1, ymm1
cmp ecx, 32
jle SHORT G_M59761_IG07
- lea edx, [rcx-0x20]
- movsxd rdx, edx
+ lea esi, [rcx-0x20]
+ movsxd rdx, esi
lea rdx, bword ptr [rax+2*rdx]
- ;; size=40 bbWeight=0.50 PerfScore 5.50
+ ;; size=33 bbWeight=0.50 PerfScore 5.25
; Main loop: 64 bytes (32 elements) per iteration, packed to 32 bytes and
; compared against both broadcasts. Any match exits to IG09; otherwise
; advance rax and repeat while rax is below rdx.
G_M59761_IG06:
vmovups ymm2, ymmword ptr [rax]
vpackuswb ymm2, ymm2, ymmword ptr [rax+0x20]
vpcmpeqb ymm3, ymm2, ymm0
vpcmpeqb ymm2, ymm2, ymm1
vpor ymm2, ymm2, ymm3
vptest ymm2, ymm2
jne SHORT G_M59761_IG09
add rax, 64
cmp rax, rdx
jb SHORT G_M59761_IG06
;; size=37 bbWeight=4 PerfScore 67.33
; Final step: rcx = rdi + 2*(length-16) addresses the last 16 elements and
; rax is clamped to at most rcx. No match returns -1 via IG16.
; vpermq with immediate -40 (0xD8) swaps the two middle 64-bit lanes
; before the byte mask is taken; vpackuswb packs within each 128-bit
; lane, so this restores element order in the mask.
; An index of 16 or more means the hit is in the load from rcx.
G_M59761_IG07:
add ecx, -16
movsxd rcx, ecx
lea rcx, bword ptr [rdi+2*rcx]
cmp rax, rcx
cmova rax, rcx
vmovups ymm2, ymmword ptr [rax]
vpackuswb ymm2, ymm2, ymmword ptr [rcx]
vpcmpeqb ymm0, ymm0, ymm2
vpcmpeqb ymm1, ymm1, ymm2
vpor ymm0, ymm1, ymm0
vptest ymm0, ymm0
je G_M59761_IG16
vpermq ymm0, ymm0, -40
vpmovmskb edx, ymm0
tzcnt edx, edx
cmp edx, 16
jl SHORT G_M59761_IG08
mov rax, rcx
add edx, -16
;; size=73 bbWeight=0.50 PerfScore 12.79
; Result = (load base - rdi) / 2 + index within that load.
G_M59761_IG08:
sub rax, rdi
shr rax, 1
add eax, edx
jmp G_M59761_IG19
align [0 bytes for IG13]
;; size=13 bbWeight=0.50 PerfScore 1.50
; Match found inside the main loop: element offset of rax from rdi plus
; the position of the first set mask bit.
G_M59761_IG09:
sub rax, rdi
shr rax, 1
vpermq ymm0, ymm2, -40
vpmovmskb edi, ymm0
xor ecx, ecx
tzcnt ecx, edi
add eax, ecx
jmp G_M59761_IG19
;; size=29 bbWeight=0.50 PerfScore 5.12
; Scalar path for lengths below 8. r8 is the element index. With at least
; 4 elements, elements 0..3 are compared in an unrolled sequence against
; the sign-extended values in si and dx; a hit jumps to the block that
; returns that constant index (IG24 = 0, IG22 = 1, IG20 = 2, IG18 = 3).
; With no hit, r8 = 4 and ecx holds length-4.
G_M59761_IG10:
xor r8d, r8d
cmp ecx, 4
jl G_M59761_IG11
add ecx, -4
movsx rax, word ptr [rdi]
movsx r9, si
cmp eax, r9d
je G_M59761_IG24
movsx r8, dx
cmp eax, r8d
sete al
movzx rax, al
test al, al
jne G_M59761_IG24
movsx rax, word ptr [rdi+0x02]
cmp eax, r9d
je G_M59761_IG22
movsx r8, dx
cmp eax, r8d
sete al
movzx rax, al
test al, al
jne G_M59761_IG22
movsx rax, word ptr [rdi+0x04]
cmp eax, r9d
je SHORT G_M59761_IG20
movsx r8, dx
cmp eax, r8d
sete al
movzx rax, al
test al, al
jne SHORT G_M59761_IG20
movsx rax, word ptr [rdi+0x06]
cmp eax, r9d
je SHORT G_M59761_IG18
movsx r9, dx
cmp eax, r9d
sete al
movzx rax, al
test al, al
jne SHORT G_M59761_IG18
mov r8d, 4
;; size=148 bbWeight=0.50 PerfScore 17.62
; Nothing left to examine: return -1.
G_M59761_IG11:
test ecx, ecx
jle SHORT G_M59761_IG16
;; size=4 bbWeight=0.50 PerfScore 0.62
; Sign-extend the first value once before entering the loop.
G_M59761_IG12:
movsx r9, si
;; size=4 bbWeight=0.25 PerfScore 0.06
; Element loop: ecx counts remaining elements, r8 is the current index.
G_M59761_IG13:
dec ecx
movsx rax, word ptr [rdi+2*r8]
cmp eax, r9d
je SHORT G_M59761_IG17
;; size=12 bbWeight=4 PerfScore 22.00
; Second comparison, against the value in dx.
G_M59761_IG14:
movsx rsi, dx
cmp eax, esi
sete al
movzx rax, al
test al, al
jne SHORT G_M59761_IG17
;; size=16 bbWeight=2 PerfScore 6.00
G_M59761_IG15:
inc r8
test ecx, ecx
jg SHORT G_M59761_IG13
;; size=7 bbWeight=4 PerfScore 6.00
; Not found: return -1.
G_M59761_IG16:
mov eax, -1
jmp SHORT G_M59761_IG19
;; size=7 bbWeight=0.50 PerfScore 1.12
; Found in the element loop: return the index held in r8d.
G_M59761_IG17:
mov eax, r8d
jmp SHORT G_M59761_IG19
;; size=5 bbWeight=0.50 PerfScore 1.12
; Found at unrolled position 3; falls through into the IG19 epilogue.
G_M59761_IG18:
mov eax, 3
;; size=5 bbWeight=0.50 PerfScore 0.12
; Epilogue used by every path that jumps to IG19.
G_M59761_IG19:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
; Found at unrolled position 2, followed by its own epilogue copy.
G_M59761_IG20:
mov eax, 2
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M59761_IG21:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
; Found at unrolled position 1, followed by its own epilogue copy.
G_M59761_IG22:
mov eax, 1
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M59761_IG23:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
; Found at unrolled position 0, followed by its own epilogue copy.
G_M59761_IG24:
xor eax, eax
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M59761_IG25:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
-; Total bytes of code 562, prolog size 4, PerfScore 170.35, instruction count 168, allocated bytes for code 566 (MethodHash=49ec168e) for method System.PackedSpanHelpers:IndexOfAny[System.SpanHelpers+DontNegate`1[short],System.PackedSpanHelpers+NopTransform](byref,short,short,int):int (FullOpts)
+; Total bytes of code 552, prolog size 4, PerfScore 169.85, instruction count 164, allocated bytes for code 552 (MethodHash=49ec168e) for method System.PackedSpanHelpers:IndexOfAny[System.SpanHelpers+DontNegate`1[short],System.PackedSpanHelpers+NopTransform](byref,short,short,int):int (FullOpts)
; ============================================================
-8 (-16.33 % of base) - System.Numerics.Vector:CreateSequence[short](short,short):System.Numerics.Vector`1[short]
; Assembly listing for method System.Numerics.Vector:CreateSequence[short](short,short):System.Numerics.Vector`1[short] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T00] ( 4, 4 ) byref -> rdi single-def
; V01 arg0 [V01,T01] ( 3, 3 ) short -> rsi single-def
; V02 arg1 [V02,T02] ( 3, 3 ) short -> rdx single-def
;# V03 OutArgs [V03 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0
; JIT diff for System.Numerics.Vector:CreateSequence[short](short,short).
; Lines prefixed "-" are the base codegen, "+" the diff codegen.
; Per the variable table: rdi = return buffer, rsi = arg0, rdx = arg1.
; Stores 16 words to [rdi] where element i = arg0 + i * arg1
; (16-bit arithmetic), and returns rdi in rax.
G_M54365_IG01:
;; size=0 bbWeight=1 PerfScore 0.00
G_M54365_IG02:
; Broadcast arg1 to every word lane and multiply by the index table RWD00.
; The diff feeds edx to vmovd directly; the base sign-extended dx into
; rax first. vpbroadcastw reads only the low word of its source.
- movsx rax, dx
- vmovd xmm0, eax
+ vmovd xmm0, edx
vpbroadcastw ymm0, ymm0
vpmullw ymm0, ymm0, ymmword ptr [reloc @RWD00]
; Broadcast arg0 and add it to every lane; same movsx removal as above.
- movsx rax, si
- vmovd xmm1, eax
+ vmovd xmm1, esi
vpbroadcastw ymm1, ymm1
vpaddw ymm0, ymm1, ymm0
; Write the result through the return buffer and return its address.
vmovups ymmword ptr [rdi], ymm0
mov rax, rdi
- ;; size=45 bbWeight=1 PerfScore 17.08
+ ;; size=37 bbWeight=1 PerfScore 16.58
G_M54365_IG03:
vzeroupper
ret
;; size=4 bbWeight=1 PerfScore 2.00
; Index table: the 16-bit values 0..15 in ascending lane order.
RWD00 dq 0003000200010000h, 0007000600050004h, 000B000A00090008h, 000F000E000D000Ch
-; Total bytes of code 49, prolog size 0, PerfScore 19.08, instruction count 12, allocated bytes for code 49 (MethodHash=28852ba2) for method System.Numerics.Vector:CreateSequence[short](short,short):System.Numerics.Vector`1[short] (FullOpts)
+; Total bytes of code 41, prolog size 0, PerfScore 18.58, instruction count 10, allocated bytes for code 41 (MethodHash=28852ba2) for method System.Numerics.Vector:CreateSequence[short](short,short):System.Numerics.Vector`1[short] (FullOpts)
; ============================================================
-8 (-17.39 % of base) - System.Runtime.Intrinsics.Vector128:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector128`1[short]
; Assembly listing for method System.Runtime.Intrinsics.Vector128:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector128`1[short] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T00] ( 4, 4 ) byref -> rdi single-def
; V01 arg0 [V01,T01] ( 3, 3 ) short -> rsi single-def
; V02 arg1 [V02,T02] ( 3, 3 ) short -> rdx single-def
;# V03 OutArgs [V03 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0
; JIT diff for Vector128:CreateSequence[short](short,short).
; Lines prefixed "-" are the base codegen, "+" the diff codegen.
; Per the variable table: rdi = return buffer, rsi = arg0, rdx = arg1.
; Stores 8 words to [rdi] where element i = arg0 + i * arg1
; (16-bit arithmetic), and returns rdi in rax.
G_M32125_IG01:
;; size=0 bbWeight=1 PerfScore 0.00
G_M32125_IG02:
; Broadcast arg1 to every word lane and multiply by the index table RWD00.
; The diff feeds edx to vmovd directly; the base sign-extended dx into
; rax first. vpbroadcastw reads only the low word of its source.
- movsx rax, dx
- vmovd xmm0, eax
+ vmovd xmm0, edx
vpbroadcastw xmm0, xmm0
vpmullw xmm0, xmm0, xmmword ptr [reloc @RWD00]
; Broadcast arg0 and add it to every lane; same movsx removal as above.
- movsx rax, si
- vmovd xmm1, eax
+ vmovd xmm1, esi
vpbroadcastw xmm1, xmm1
vpaddw xmm0, xmm1, xmm0
; Write the result through the return buffer and return its address.
vmovups xmmword ptr [rdi], xmm0
mov rax, rdi
- ;; size=45 bbWeight=1 PerfScore 15.08
+ ;; size=37 bbWeight=1 PerfScore 14.58
G_M32125_IG03:
ret
;; size=1 bbWeight=1 PerfScore 1.00
; Index table: the 16-bit values 0..7 in ascending lane order.
RWD00 dq 0003000200010000h, 0007000600050004h
-; Total bytes of code 46, prolog size 0, PerfScore 16.08, instruction count 11, allocated bytes for code 46 (MethodHash=a9108282) for method System.Runtime.Intrinsics.Vector128:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector128`1[short] (FullOpts)
+; Total bytes of code 38, prolog size 0, PerfScore 15.58, instruction count 9, allocated bytes for code 38 (MethodHash=a9108282) for method System.Runtime.Intrinsics.Vector128:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector128`1[short] (FullOpts)
; ============================================================
-8 (-16.33 % of base) - System.Runtime.Intrinsics.Vector256:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector256`1[short]
; Assembly listing for method System.Runtime.Intrinsics.Vector256:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector256`1[short] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T00] ( 4, 4 ) byref -> rdi single-def
; V01 arg0 [V01,T01] ( 3, 3 ) short -> rsi single-def
; V02 arg1 [V02,T02] ( 3, 3 ) short -> rdx single-def
;# V03 OutArgs [V03 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0
; JIT diff for Vector256:CreateSequence[short](short,short).
; Lines prefixed "-" are the base codegen, "+" the diff codegen.
; Per the variable table: rdi = return buffer, rsi = arg0, rdx = arg1.
; Stores 16 words to [rdi] where element i = arg0 + i * arg1
; (16-bit arithmetic), and returns rdi in rax.
G_M9853_IG01:
;; size=0 bbWeight=1 PerfScore 0.00
G_M9853_IG02:
; Broadcast arg1 to every word lane and multiply by the index table RWD00.
; The diff feeds edx to vmovd directly; the base sign-extended dx into
; rax first. vpbroadcastw reads only the low word of its source.
- movsx rax, dx
- vmovd xmm0, eax
+ vmovd xmm0, edx
vpbroadcastw ymm0, ymm0
vpmullw ymm0, ymm0, ymmword ptr [reloc @RWD00]
; Broadcast arg0 and add it to every lane; same movsx removal as above.
- movsx rax, si
- vmovd xmm1, eax
+ vmovd xmm1, esi
vpbroadcastw ymm1, ymm1
vpaddw ymm0, ymm1, ymm0
; Write the result through the return buffer and return its address.
vmovups ymmword ptr [rdi], ymm0
mov rax, rdi
- ;; size=45 bbWeight=1 PerfScore 17.08
+ ;; size=37 bbWeight=1 PerfScore 16.58
G_M9853_IG03:
vzeroupper
ret
;; size=4 bbWeight=1 PerfScore 2.00
; Index table: the 16-bit values 0..15 in ascending lane order.
RWD00 dq 0003000200010000h, 0007000600050004h, 000B000A00090008h, 000F000E000D000Ch
-; Total bytes of code 49, prolog size 0, PerfScore 19.08, instruction count 12, allocated bytes for code 49 (MethodHash=587dd982) for method System.Runtime.Intrinsics.Vector256:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector256`1[short] (FullOpts)
+; Total bytes of code 41, prolog size 0, PerfScore 18.58, instruction count 10, allocated bytes for code 41 (MethodHash=587dd982) for method System.Runtime.Intrinsics.Vector256:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector256`1[short] (FullOpts)
; ============================================================
-8 (-7.69 % of base) - System.Runtime.Intrinsics.Vector512:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector512`1[short]
; Assembly listing for method System.Runtime.Intrinsics.Vector512:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector512`1[short] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 14 single block inlinees; 7 inlinees without PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T01] ( 5, 5 ) byref -> rdi single-def
; V01 arg0 [V01,T02] ( 3, 3 ) short -> rsi single-def
; V02 arg1 [V02,T03] ( 3, 3 ) short -> rdx single-def
;# V03 OutArgs [V03 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V04 tmp1 [V04 ] ( 0, 0 ) struct (64) zero-ref "spilled call-like call argument" <System.Runtime.Intrinsics.Vector512`1[short]>
;* V05 tmp2 [V05 ] ( 0, 0 ) struct (64) zero-ref "impAppendStmt" <System.Runtime.Intrinsics.Vector512`1[short]>
;* V06 tmp3 [V06 ] ( 0, 0 ) struct (64) zero-ref "spilled call-like call argument" <System.Runtime.Intrinsics.Vector512`1[short]>
; V07 tmp4 [V07 ] ( 3, 6 ) struct (64) [rbp-0x40] do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <System.Runtime.Intrinsics.Vector512`1[short]>
; V08 tmp5 [V08,T00] ( 6, 20.25) int -> rax "Inline stloc first use temp"
;* V09 tmp6 [V09 ] ( 0, 0 ) short -> zero-ref "Inline return value spill temp"
;* V10 tmp7 [V10 ] ( 0, 0 ) struct (64) zero-ref ld-addr-op "Inline ldloca(s) first use temp" <System.Runtime.Intrinsics.Vector512`1[short]>
; V11 tmp8 [V11,T05] ( 2, 4 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
; V12 tmp9 [V12,T06] ( 2, 4 ) simd32 -> mm1 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
; V13 tmp10 [V13,T04] ( 3, 6 ) simd32 -> mm2 "dup spill"
;* V14 tmp11 [V14 ] ( 0, 0 ) struct (64) zero-ref ld-addr-op "Inline ldloca(s) first use temp" <System.Runtime.Intrinsics.Vector512`1[short]>
;* V15 tmp12 [V15 ] ( 0, 0 ) struct (64) zero-ref ld-addr-op "Inline ldloca(s) first use temp" <System.Runtime.Intrinsics.Vector512`1[short]>
; V16 tmp13 [V16,T07] ( 2, 4 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
; V17 tmp14 [V17,T08] ( 2, 4 ) simd32 -> mm1 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
; V18 tmp15 [V18,T10] ( 2, 2 ) simd32 -> mm0 "field V04._lower (fldOffset=0x0)" P-INDEP
; V19 tmp16 [V19,T11] ( 2, 2 ) simd32 -> mm1 "field V04._upper (fldOffset=0x20)" P-INDEP
;* V20 tmp17 [V20 ] ( 0, 0 ) simd32 -> zero-ref "field V05._lower (fldOffset=0x0)" P-INDEP
;* V21 tmp18 [V21 ] ( 0, 0 ) simd32 -> zero-ref "field V05._upper (fldOffset=0x20)" P-INDEP
;* V22 tmp19 [V22 ] ( 0, 0 ) simd32 -> zero-ref "field V06._lower (fldOffset=0x0)" P-INDEP
;* V23 tmp20 [V23 ] ( 0, 0 ) simd32 -> zero-ref "field V06._upper (fldOffset=0x20)" P-INDEP
; V24 tmp21 [V24 ] ( 2, 5 ) simd32 -> [rbp-0x40] do-not-enreg[XS] addr-exposed "field V07._lower (fldOffset=0x0)" P-DEP
; V25 tmp22 [V25 ] ( 2, 5 ) simd32 -> [rbp-0x20] do-not-enreg[XS] addr-exposed "field V07._upper (fldOffset=0x20)" P-DEP
;* V26 tmp23 [V26 ] ( 0, 0 ) simd32 -> zero-ref "field V10._lower (fldOffset=0x0)" P-INDEP
;* V27 tmp24 [V27 ] ( 0, 0 ) simd32 -> zero-ref "field V10._upper (fldOffset=0x20)" P-INDEP
;* V28 tmp25 [V28 ] ( 0, 0 ) simd32 -> zero-ref "field V14._lower (fldOffset=0x0)" P-INDEP
;* V29 tmp26 [V29 ] ( 0, 0 ) simd32 -> zero-ref "field V14._upper (fldOffset=0x20)" P-INDEP
; V30 tmp27 [V30,T12] ( 2, 2 ) simd32 -> mm0 "field V15._lower (fldOffset=0x0)" P-INDEP
; V31 tmp28 [V31,T13] ( 2, 2 ) simd32 -> mm1 "field V15._upper (fldOffset=0x20)" P-INDEP
; V32 cse0 [V32,T09] ( 3, 3 ) simd32 -> mm2 "CSE #02: moderate"
;
; Lcl frame size = 64
; JIT diff for Vector512:CreateSequence[short](short,short).
; Lines prefixed "-" are the base codegen, "+" the diff codegen.
; Per the variable table: rdi = return buffer, rsi = arg0, rdx = arg1.
; Stores 32 words to [rdi] where element i = arg0 + i * arg1
; (16-bit arithmetic), handled as two 256-bit halves, and returns rdi
; in rax.
; Prolog: reserve the 64-byte stack temp addressed as [rbp-0x40].
G_M3741_IG01:
push rbp
sub rsp, 64
lea rbp, [rsp+0x40]
;; size=10 bbWeight=0.25 PerfScore 0.44
G_M3741_IG02:
xor eax, eax
align [0 bytes for IG03]
;; size=2 bbWeight=0.25 PerfScore 0.06
; Fill the stack temp with the index sequence: word i = i for i = 0..31.
G_M3741_IG03:
lea rcx, [rbp-0x40]
movsxd r8, eax
mov word ptr [rcx+2*r8], ax
inc eax
cmp eax, 32
jl SHORT G_M3741_IG03
;; size=19 bbWeight=4 PerfScore 13.00
; Load the index sequence as lower (ymm0) and upper (ymm1) halves.
G_M3741_IG04:
vmovups ymm0, ymmword ptr [rbp-0x40]
vmovups ymm1, ymmword ptr [rbp-0x20]
; Broadcast arg1 and multiply both halves by it. The diff feeds edx to
; vmovd directly; the base sign-extended dx into rax first.
; vpbroadcastw reads only the low word of its source.
- movsx rax, dx
- vmovd xmm2, eax
+ vmovd xmm2, edx
vpbroadcastw ymm2, ymm2
vpmullw ymm0, ymm2, ymm0
vpmullw ymm1, ymm2, ymm1
; Broadcast arg0 and add it to both halves; same movsx removal as above.
- movsx rax, si
- vmovd xmm2, eax
+ vmovd xmm2, esi
vpbroadcastw ymm2, ymm2
vpaddw ymm0, ymm2, ymm0
vpaddw ymm1, ymm2, ymm1
; Write both halves through the return buffer and return its address.
vmovups ymmword ptr [rdi], ymm0
vmovups ymmword ptr [rdi+0x20], ymm1
mov rax, rdi
- ;; size=64 bbWeight=1 PerfScore 29.42
+ ;; size=56 bbWeight=1 PerfScore 28.92
; Epilog: release the stack temp.
G_M3741_IG05:
vzeroupper
add rsp, 64
pop rbp
ret
;; size=9 bbWeight=1 PerfScore 2.75
-; Total bytes of code 104, prolog size 10, PerfScore 45.67, instruction count 30, allocated bytes for code 104 (MethodHash=ecfef162) for method System.Runtime.Intrinsics.Vector512:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector512`1[short] (FullOpts)
+; Total bytes of code 96, prolog size 10, PerfScore 45.17, instruction count 28, allocated bytes for code 96 (MethodHash=ecfef162) for method System.Runtime.Intrinsics.Vector512:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector512`1[short] (FullOpts)
; ============================================================
-7 (-6.09 % of base) - System.Numerics.Vector:CreateSequence[ubyte](ubyte,ubyte):System.Numerics.Vector`1[ubyte]
; Assembly listing for method System.Numerics.Vector:CreateSequence[ubyte](ubyte,ubyte):System.Numerics.Vector`1[ubyte] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 4 single block inlinees; 2 inlinees without PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T02] ( 4, 4 ) byref -> rdi single-def
; V01 arg0 [V01,T03] ( 3, 3 ) ubyte -> rsi single-def
; V02 arg1 [V02,T04] ( 3, 2.25) ubyte -> rdx single-def
;# V03 OutArgs [V03 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V04 tmp1 [V04 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
; V05 tmp2 [V05 ] ( 2, 5 ) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <System.Numerics.Vector`1[ubyte]>
; V06 tmp3 [V06,T00] ( 5, 16.25) int -> rax "Inline stloc first use temp"
; V07 tmp4 [V07 ] ( 2, 8.50) simd32 -> [rbp-0x50] do-not-enreg[XS] addr-exposed ld-addr-op "Inlining Arg" <System.Numerics.Vector`1[ubyte]>
; V08 tmp5 [V08 ] ( 2, 8.50) simd32 -> [rbp-0x70] do-not-enreg[XS] addr-exposed ld-addr-op "Inlining Arg" <System.Numerics.Vector`1[ubyte]>
;* V09 tmp6 [V09 ] ( 0, 0 ) int -> zero-ref "impAppendStmt"
;* V10 tmp7 [V10 ] ( 0, 0 ) ubyte -> zero-ref "Inline stloc first use temp"
;* V11 tmp8 [V11 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V12 tmp9 [V12 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V13 tmp10 [V13 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
; V14 cse0 [V14,T01] ( 4, 16 ) long -> rdx "CSE #01: aggressive"
;
; Lcl frame size = 112
; JIT diff for System.Numerics.Vector:CreateSequence[ubyte](ubyte,ubyte).
; Lines prefixed "-" are the base codegen, "+" the diff codegen.
; Per the variable table: rdi = return buffer, rsi = arg0, rdx = arg1.
; Stores 32 bytes to [rdi] where element i = arg0 + i * arg1
; (8-bit arithmetic), and returns rdi in rax. The multiply runs as a
; scalar byte loop over stack temps; only the final add is vectorized.
; Prolog: reserve 112 bytes for the three 32-byte stack temps.
G_M16765_IG01:
push rbp
sub rsp, 112
lea rbp, [rsp+0x70]
;; size=10 bbWeight=0.25 PerfScore 0.44
; Spill the index table RWD00 to [rbp-0x50] and arg1 broadcast to every
; byte to [rbp-0x70]. The diff feeds edx to vmovd directly; the base
; zero-extended dl into rax first. vpbroadcastb reads only the low byte.
G_M16765_IG02:
vmovups ymm0, ymmword ptr [reloc @RWD00]
vmovups ymmword ptr [rbp-0x50], ymm0
- movzx rax, dl
- vmovd xmm0, eax
+ vmovd xmm0, edx
vpbroadcastb ymm0, ymm0
vmovups ymmword ptr [rbp-0x70], ymm0
xor eax, eax
align [0 bytes for IG03]
- ;; size=32 bbWeight=0.25 PerfScore 2.62
+ ;; size=29 bbWeight=0.25 PerfScore 2.56
; For i = 0..31: byte i of [rbp-0x30] = low byte of
; (byte i of [rbp-0x50]) * (byte i of [rbp-0x70]).
G_M16765_IG03:
lea rcx, [rbp-0x50]
movsxd rdx, eax
movzx rcx, byte ptr [rcx+rdx]
lea r8, [rbp-0x70]
movzx r8, byte ptr [r8+rdx]
imul ecx, r8d
lea r8, [rbp-0x30]
mov byte ptr [r8+rdx], cl
inc eax
cmp eax, 32
jl SHORT G_M16765_IG03
;; size=39 bbWeight=4 PerfScore 41.00
; Broadcast arg0, add it to the products, store through the return buffer
; and return its address. Same movzx removal as in IG02.
G_M16765_IG04:
- movzx rax, sil
- vmovd xmm0, eax
+ vmovd xmm0, esi
vpbroadcastb ymm0, ymm0
vpaddb ymm0, ymm0, ymmword ptr [rbp-0x30]
vmovups ymmword ptr [rdi], ymm0
mov rax, rdi
- ;; size=25 bbWeight=1 PerfScore 8.50
+ ;; size=21 bbWeight=1 PerfScore 8.25
; Epilog: release the stack temps.
G_M16765_IG05:
vzeroupper
add rsp, 112
pop rbp
ret
;; size=9 bbWeight=1 PerfScore 2.75
; Index table: the byte values 0..31 in ascending lane order.
RWD00 dq 0706050403020100h, 0F0E0D0C0B0A0908h, 1716151413121110h, 1F1E1D1C1B1A1918h
-; Total bytes of code 115, prolog size 10, PerfScore 55.31, instruction count 32, allocated bytes for code 115 (MethodHash=2b2fbe82) for method System.Numerics.Vector:CreateSequence[ubyte](ubyte,ubyte):System.Numerics.Vector`1[ubyte] (FullOpts)
+; Total bytes of code 108, prolog size 10, PerfScore 55.00, instruction count 30, allocated bytes for code 108 (MethodHash=2b2fbe82) for method System.Numerics.Vector:CreateSequence[ubyte](ubyte,ubyte):System.Numerics.Vector`1[ubyte] (FullOpts)
; ============================================================
-7 (-9.21 % of base) - System.Runtime.Intrinsics.Vector128:CreateSequence[ubyte](ubyte,ubyte):System.Runtime.Intrinsics.Vector128`1[ubyte]
; Assembly listing for method System.Runtime.Intrinsics.Vector128:CreateSequence[ubyte](ubyte,ubyte):System.Runtime.Intrinsics.Vector128`1[ubyte] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T00] ( 4, 4 ) byref -> rdi single-def
; V01 arg0 [V01,T01] ( 3, 3 ) ubyte -> rsi single-def
; V02 arg1 [V02,T02] ( 3, 3 ) ubyte -> rdx single-def
;# V03 OutArgs [V03 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V04 tmp1 [V04,T03] ( 3, 6 ) simd32 -> mm0 "fgMakeTemp is creating a new local variable"
;
; Lcl frame size = 0
; Emitted body shown as a diff: '-' lines are the base codegen, '+' lines the new codegen.
; Register roles per the local table: rdi = RetBuf, rsi = arg0, rdx = arg1.
G_M32957_IG01:
;; size=0 bbWeight=1 PerfScore 0.00
G_M32957_IG02:
; Widen the 16 constant bytes 0..15 (RWD00) to words.
vpmovzxbw ymm0, xmmword ptr [reloc @RWD00]
; The explicit zero-extension of dl is dropped: vpbroadcastb below reads only the low byte of xmm1.
- movzx rax, dl
- vmovd xmm1, eax
+ vmovd xmm1, edx
vpbroadcastb xmm1, xmm1
vpmovzxbw ymm1, ymm1
; Multiply as words, keep the low byte of each product (RWD32 mask), then pack back to bytes.
vpmullw ymm0, ymm1, ymm0
vpand ymm0, ymm0, ymmword ptr [reloc @RWD32]
vpackuswb ymm0, ymm0, ymm0
; imm -40 = 0xD8 selects qwords 0,2,1,3, undoing the per-lane interleave left by vpackuswb.
vpermq ymm0, ymm0, -40
; Same simplification for arg0 (sil): broadcast it and add it to every byte.
- movzx rax, sil
- vmovd xmm1, eax
+ vmovd xmm1, esi
vpbroadcastb xmm1, xmm1
vpaddb xmm0, xmm1, xmm0
; Store the 16-byte result through the return buffer and return its address in rax.
vmovups xmmword ptr [rdi], xmm0
mov rax, rdi
- ;; size=72 bbWeight=1 PerfScore 24.08
+ ;; size=65 bbWeight=1 PerfScore 23.58
G_M32957_IG03:
vzeroupper
ret
;; size=4 bbWeight=1 PerfScore 2.00
; Read-only data: RWD00 = byte indices 0..15, RWD32 = 0x00FF mask for every word.
RWD00 dq 0706050403020100h, 0F0E0D0C0B0A0908h
RWD16 dd 00000000h, 00000000h, 00000000h, 00000000h
RWD32 dq 00FF00FF00FF00FFh, 00FF00FF00FF00FFh, 00FF00FF00FF00FFh, 00FF00FF00FF00FFh
; Net effect of the two removed movzx instructions: 7 bytes and 2 instructions saved.
-; Total bytes of code 76, prolog size 0, PerfScore 26.08, instruction count 17, allocated bytes for code 76 (MethodHash=ce707f42) for method System.Runtime.Intrinsics.Vector128:CreateSequence[ubyte](ubyte,ubyte):System.Runtime.Intrinsics.Vector128`1[ubyte] (FullOpts)
+; Total bytes of code 69, prolog size 0, PerfScore 25.58, instruction count 15, allocated bytes for code 69 (MethodHash=ce707f42) for method System.Runtime.Intrinsics.Vector128:CreateSequence[ubyte](ubyte,ubyte):System.Runtime.Intrinsics.Vector128`1[ubyte] (FullOpts)
; ============================================================
-7 (-6.25 % of base) - System.Runtime.Intrinsics.Vector256:CreateSequence[ubyte](ubyte,ubyte):System.Runtime.Intrinsics.Vector256`1[ubyte]
; Assembly listing for method System.Runtime.Intrinsics.Vector256:CreateSequence[ubyte](ubyte,ubyte):System.Runtime.Intrinsics.Vector256`1[ubyte] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T00] ( 4, 4 ) byref -> rdi single-def
; V01 arg0 [V01,T01] ( 3, 3 ) ubyte -> rsi single-def
; V02 arg1 [V02,T02] ( 3, 3 ) ubyte -> rdx single-def
;# V03 OutArgs [V03 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V04 tmp1 [V04,T03] ( 3, 6 ) simd32 -> mm1 "fgMakeTemp is creating a new local variable"
; V05 cse0 [V05,T04] ( 3, 3 ) simd32 -> mm2 "CSE #01: aggressive"
;
; Lcl frame size = 0
; Emitted body shown as a diff: '-' lines are the base codegen, '+' lines the new codegen.
; Register roles per the local table: rdi = RetBuf, rsi = arg0, rdx = arg1.
G_M8317_IG01:
;; size=0 bbWeight=1 PerfScore 0.00
G_M8317_IG02:
; Low half: widen constant bytes 0..15 (RWD00) to words.
vpmovzxbw ymm0, xmmword ptr [reloc @RWD00]
; The explicit zero-extension of dl is dropped: vpbroadcastb below reads only the low byte of xmm1.
- movzx rax, dl
- vmovd xmm1, eax
+ vmovd xmm1, edx
vpbroadcastb ymm1, ymm1
vmovaps ymm2, ymm1
vpmovzxbw ymm2, ymm2
vpmullw ymm0, ymm2, ymm0
; ymm2 now holds the 0x00FF word mask (RWD32), reused for both halves.
vmovups ymm2, ymmword ptr [reloc @RWD32]
vpand ymm0, ymm0, ymm2
; High half: constant bytes 16..31 (RWD64) times the upper 128 bits of the broadcast.
vpmovzxbw ymm3, xmmword ptr [reloc @RWD64]
vextracti128 xmm1, ymm1, 1
vpmovzxbw ymm1, ymm1
vpmullw ymm1, ymm1, ymm3
vpand ymm1, ymm1, ymm2
; Pack both halves back to bytes; imm -40 = 0xD8 reorders qwords to 0,2,1,3 after the per-lane pack.
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, -40
; Same simplification for arg0 (sil): broadcast it and add it to every byte.
- movzx rax, sil
- vmovd xmm1, eax
+ vmovd xmm1, esi
vpbroadcastb ymm1, ymm1
vpaddb ymm0, ymm1, ymm0
; Store the 32-byte result through the return buffer and return its address in rax.
vmovups ymmword ptr [rdi], ymm0
mov rax, rdi
- ;; size=108 bbWeight=1 PerfScore 41.00
+ ;; size=101 bbWeight=1 PerfScore 40.50
G_M8317_IG03:
vzeroupper
ret
;; size=4 bbWeight=1 PerfScore 2.00
; Read-only data: RWD00/RWD64 = byte indices 0..15 / 16..31, RWD32 = 0x00FF mask for every word.
RWD00 dq 0706050403020100h, 0F0E0D0C0B0A0908h
RWD16 dd 00000000h, 00000000h, 00000000h, 00000000h
RWD32 dq 00FF00FF00FF00FFh, 00FF00FF00FF00FFh, 00FF00FF00FF00FFh, 00FF00FF00FF00FFh
RWD64 dq 1716151413121110h, 1F1E1D1C1B1A1918h
; Net effect of the two removed movzx instructions: 7 bytes and 2 instructions saved.
-; Total bytes of code 112, prolog size 0, PerfScore 43.00, instruction count 24, allocated bytes for code 112 (MethodHash=ef88df82) for method System.Runtime.Intrinsics.Vector256:CreateSequence[ubyte](ubyte,ubyte):System.Runtime.Intrinsics.Vector256`1[ubyte] (FullOpts)
+; Total bytes of code 105, prolog size 0, PerfScore 42.50, instruction count 22, allocated bytes for code 105 (MethodHash=ef88df82) for method System.Runtime.Intrinsics.Vector256:CreateSequence[ubyte](ubyte,ubyte):System.Runtime.Intrinsics.Vector256`1[ubyte] (FullOpts)
; ============================================================
-7 (-3.14 % of base) - System.Runtime.Intrinsics.Vector512:CreateSequence[ubyte](ubyte,ubyte):System.Runtime.Intrinsics.Vector512`1[ubyte]
; Assembly listing for method System.Runtime.Intrinsics.Vector512:CreateSequence[ubyte](ubyte,ubyte):System.Runtime.Intrinsics.Vector512`1[ubyte] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 14 single block inlinees; 7 inlinees without PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T01] ( 5, 5 ) byref -> rdi single-def
; V01 arg0 [V01,T02] ( 3, 3 ) ubyte -> rsi single-def
; V02 arg1 [V02,T03] ( 3, 3 ) ubyte -> rdx single-def
;# V03 OutArgs [V03 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V04 tmp1 [V04 ] ( 0, 0 ) struct (64) zero-ref "spilled call-like call argument" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V05 tmp2 [V05 ] ( 0, 0 ) struct (64) zero-ref "impAppendStmt" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V06 tmp3 [V06 ] ( 0, 0 ) struct (64) zero-ref "spilled call-like call argument" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
; V07 tmp4 [V07 ] ( 3, 6 ) struct (64) [rbp-0x40] do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
; V08 tmp5 [V08,T00] ( 6, 20.25) int -> rax "Inline stloc first use temp"
;* V09 tmp6 [V09 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V10 tmp7 [V10 ] ( 0, 0 ) simd32 -> zero-ref "fgMakeTemp is creating a new local variable"
; V11 tmp8 [V11,T04] ( 3, 6 ) simd32 -> mm4 "fgMakeTemp is creating a new local variable"
;* V12 tmp9 [V12 ] ( 0, 0 ) simd32 -> zero-ref "fgMakeTemp is creating a new local variable"
; V13 tmp10 [V13,T07] ( 2, 4 ) simd32 -> mm3 "fgMakeTemp is creating a new local variable"
;* V14 tmp11 [V14 ] ( 0, 0 ) struct (64) zero-ref ld-addr-op "Inline ldloca(s) first use temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
; V15 tmp12 [V15,T08] ( 2, 4 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V16 tmp13 [V16,T09] ( 2, 4 ) simd32 -> mm1 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V17 tmp14 [V17,T05] ( 3, 6 ) simd32 -> mm2 "dup spill"
;* V18 tmp15 [V18 ] ( 0, 0 ) struct (64) zero-ref ld-addr-op "Inline ldloca(s) first use temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V19 tmp16 [V19 ] ( 0, 0 ) struct (64) zero-ref ld-addr-op "Inline ldloca(s) first use temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
; V20 tmp17 [V20,T10] ( 2, 4 ) simd32 -> mm0 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V21 tmp18 [V21,T11] ( 2, 4 ) simd32 -> mm1 "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V22 tmp19 [V22,T12] ( 3, 3 ) simd32 -> mm0 "field V04._lower (fldOffset=0x0)" P-INDEP
; V23 tmp20 [V23,T13] ( 3, 3 ) simd32 -> mm1 "field V04._upper (fldOffset=0x20)" P-INDEP
;* V24 tmp21 [V24 ] ( 0, 0 ) simd32 -> zero-ref "field V05._lower (fldOffset=0x0)" P-INDEP
;* V25 tmp22 [V25 ] ( 0, 0 ) simd32 -> zero-ref "field V05._upper (fldOffset=0x20)" P-INDEP
;* V26 tmp23 [V26 ] ( 0, 0 ) simd32 -> zero-ref "field V06._lower (fldOffset=0x0)" P-INDEP
;* V27 tmp24 [V27 ] ( 0, 0 ) simd32 -> zero-ref "field V06._upper (fldOffset=0x20)" P-INDEP
; V28 tmp25 [V28 ] ( 2, 5 ) simd32 -> [rbp-0x40] do-not-enreg[XS] addr-exposed "field V07._lower (fldOffset=0x0)" P-DEP
; V29 tmp26 [V29 ] ( 2, 5 ) simd32 -> [rbp-0x20] do-not-enreg[XS] addr-exposed "field V07._upper (fldOffset=0x20)" P-DEP
;* V30 tmp27 [V30 ] ( 0, 0 ) simd32 -> zero-ref "field V14._lower (fldOffset=0x0)" P-INDEP
;* V31 tmp28 [V31 ] ( 0, 0 ) simd32 -> zero-ref "field V14._upper (fldOffset=0x20)" P-INDEP
;* V32 tmp29 [V32 ] ( 0, 0 ) simd32 -> zero-ref "field V18._lower (fldOffset=0x0)" P-INDEP
;* V33 tmp30 [V33 ] ( 0, 0 ) simd32 -> zero-ref "field V18._upper (fldOffset=0x20)" P-INDEP
; V34 tmp31 [V34,T16] ( 2, 2 ) simd32 -> mm0 "field V19._lower (fldOffset=0x0)" P-INDEP
; V35 tmp32 [V35,T17] ( 2, 2 ) simd32 -> mm1 "field V19._upper (fldOffset=0x20)" P-INDEP
; V36 cse0 [V36,T14] ( 3, 3 ) simd32 -> mm3 "CSE #03: moderate"
; V37 cse1 [V37,T06] ( 5, 5 ) simd32 -> mm5 "CSE #01: moderate"
; V38 cse2 [V38,T15] ( 3, 3 ) simd32 -> mm4 "CSE #05: moderate"
;
; Lcl frame size = 64
; Emitted body shown as a diff: '-' lines are the base codegen, '+' lines the new codegen.
; Register roles per the local table: rdi = RetBuf, rsi = arg0, rdx = arg1.
; The 64-byte vector is handled as two 32-byte halves (_lower in ymm0, _upper in ymm1).
G_M18333_IG01:
push rbp
sub rsp, 64
lea rbp, [rsp+0x40]
;; size=10 bbWeight=0.25 PerfScore 0.44
G_M18333_IG02:
xor eax, eax
align [0 bytes for IG03]
;; size=2 bbWeight=0.25 PerfScore 0.06
G_M18333_IG03:
; Scalar loop: write byte i at [rbp-0x40 + i] for i = 0..63, building the index sequence on the stack.
lea rcx, [rbp-0x40]
movsxd r8, eax
mov byte ptr [rcx+r8], al
inc eax
cmp eax, 64
jl SHORT G_M18333_IG03
;; size=18 bbWeight=4 PerfScore 13.00
G_M18333_IG04:
; Reload the two halves of the index sequence built above.
vmovups ymm0, ymmword ptr [rbp-0x40]
vmovups ymm1, ymmword ptr [rbp-0x20]
vmovaps ymm2, ymm0
vpmovzxbw ymm2, ymm2
; The explicit zero-extension of dl is dropped: vpbroadcastb below reads only the low byte of xmm3.
- movzx rax, dl
- vmovd xmm3, eax
+ vmovd xmm3, edx
vpbroadcastb ymm3, ymm3
vmovaps ymm4, ymm3
vmovaps ymm5, ymm4
vpmovzxbw ymm5, ymm5
vpmullw ymm2, ymm5, ymm2
; ymm5 now holds the 0x00FF word mask (RWD00), reused for every product below.
vmovups ymm5, ymmword ptr [reloc @RWD00]
vpand ymm2, ymm2, ymm5
vextracti128 xmm0, ymm0, 1
vpmovzxbw ymm0, ymm0
; ymm4 = upper 128 bits of the broadcast widened to words; reused again for the upper half at the second vpmullw with ymm4.
vextracti128 xmm4, ymm4, 1
vpmovzxbw ymm4, ymm4
vpmullw ymm0, ymm0, ymm4
vpand ymm0, ymm0, ymm5
; Pack the lower half back to bytes; imm -40 = 0xD8 reorders qwords to 0,2,1,3 after the per-lane pack.
vpackuswb ymm0, ymm2, ymm0
vpermq ymm0, ymm0, -40
; Repeat the widen/multiply/mask/pack sequence for the upper half in ymm1.
vmovaps ymm2, ymm1
vpmovzxbw ymm2, ymm2
vpmovzxbw ymm3, ymm3
vpmullw ymm2, ymm3, ymm2
vpand ymm2, ymm2, ymm5
vextracti128 xmm1, ymm1, 1
vpmovzxbw ymm1, ymm1
vpmullw ymm1, ymm1, ymm4
vpand ymm1, ymm1, ymm5
vpackuswb ymm1, ymm2, ymm1
vpermq ymm1, ymm1, -40
; Same simplification for arg0 (sil): broadcast it and add it to every byte of both halves.
- movzx rax, sil
- vmovd xmm2, eax
+ vmovd xmm2, esi
vpbroadcastb ymm2, ymm2
vpaddb ymm0, ymm2, ymm0
vpaddb ymm1, ymm2, ymm1
; Store both halves through the return buffer and return its address in rax.
vmovups ymmword ptr [rdi], ymm0
vmovups ymmword ptr [rdi+0x20], ymm1
mov rax, rdi
- ;; size=184 bbWeight=1 PerfScore 69.75
+ ;; size=177 bbWeight=1 PerfScore 69.25
G_M18333_IG05:
vzeroupper
add rsp, 64
pop rbp
ret
;; size=9 bbWeight=1 PerfScore 2.75
; Read-only data: RWD00 = 0x00FF mask for every word.
RWD00 dq 00FF00FF00FF00FFh, 00FF00FF00FF00FFh, 00FF00FF00FF00FFh, 00FF00FF00FF00FFh
; Net effect of the two removed movzx instructions: 7 bytes and 2 instructions saved.
-; Total bytes of code 223, prolog size 10, PerfScore 86.00, instruction count 55, allocated bytes for code 223 (MethodHash=434cb862) for method System.Runtime.Intrinsics.Vector512:CreateSequence[ubyte](ubyte,ubyte):System.Runtime.Intrinsics.Vector512`1[ubyte] (FullOpts)
+; Total bytes of code 216, prolog size 10, PerfScore 85.50, instruction count 53, allocated bytes for code 216 (MethodHash=434cb862) for method System.Runtime.Intrinsics.Vector512:CreateSequence[ubyte](ubyte,ubyte):System.Runtime.Intrinsics.Vector512`1[ubyte] (FullOpts)
; ============================================================
-6 (-1.64 % of base) - System.PackedSpanHelpers:IndexOfAnyInRange[System.SpanHelpers+DontNegate`1[short]](byref,short,short,int):int
; Assembly listing for method System.PackedSpanHelpers:IndexOfAnyInRange[System.SpanHelpers+DontNegate`1[short]](byref,short,short,int):int (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 7 single block inlinees; 5 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T02] ( 11, 10 ) byref -> rdi single-def
; V01 arg1 [V01,T04] ( 5, 3.50) short -> rsi single-def
; V02 arg2 [V02,T05] ( 5, 3.50) short -> rdx single-def
; V03 arg3 [V03,T03] ( 10, 10 ) int -> rcx single-def
; V04 loc0 [V04,T08] ( 2, 4.50) int -> rax
; V05 loc1 [V05,T09] ( 2, 4.50) int -> rdx
; V06 loc2 [V06,T01] ( 6, 17 ) int -> rsi
; V07 loc3 [V07,T00] ( 10, 22.50) byref -> rax
;* V08 loc4 [V08 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V09 loc5 [V09 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V10 loc6 [V10 ] ( 0, 0 ) byref -> zero-ref
;* V11 loc7 [V11 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[short]>
;* V12 loc8 [V12 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V13 loc9 [V13 ] ( 0, 0 ) byref -> zero-ref
;* V14 loc10 [V14 ] ( 0, 0 ) byref -> zero-ref
;* V15 loc11 [V15 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[short]>
;* V16 loc12 [V16 ] ( 0, 0 ) struct (64) zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
; V17 loc13 [V17,T20] ( 3, 5 ) simd32 -> mm0 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V18 loc14 [V18,T21] ( 3, 5 ) simd32 -> mm1 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V19 loc15 [V19,T07] ( 2, 4.50) byref -> rdx single-def
;* V20 loc16 [V20 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[short]>
; V21 loc17 [V21,T19] ( 4, 12.50) simd32 -> mm2 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V22 loc18 [V22,T12] ( 5, 2.50) byref -> rcx single-def
;* V23 loc19 [V23 ] ( 0, 0 ) byref -> zero-ref single-def
;* V24 loc20 [V24 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[short]>
; V25 loc21 [V25,T24] ( 4, 2 ) simd32 -> mm0 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V26 loc22 [V26,T26] ( 2, 1 ) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V27 loc23 [V27,T27] ( 2, 1 ) simd16 -> mm1 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V28 loc24 [V28 ] ( 0, 0 ) byref -> zero-ref
;* V29 loc25 [V29 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[short]>
;* V30 loc26 [V30 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V31 loc27 [V31,T13] ( 5, 2.50) byref -> rax single-def
;* V32 loc28 [V32 ] ( 0, 0 ) byref -> zero-ref single-def
;* V33 loc29 [V33 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[short]>
; V34 loc30 [V34,T25] ( 4, 2 ) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;# V35 OutArgs [V35 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-; V36 tmp1 [V36,T16] ( 3, 1.50) byref -> rcx
+; V36 tmp1 [V36,T16] ( 3, 1.50) byref -> rdx
;* V37 tmp2 [V37 ] ( 0, 0 ) simd16 -> zero-ref "impAppendStmt"
;* V38 tmp3 [V38 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
; V39 tmp4 [V39,T22] ( 3, 3 ) simd16 -> mm0 "fgMakeTemp is creating a new local variable"
; V40 tmp5 [V40,T17] ( 3, 1.50) byref -> rax
;* V41 tmp6 [V41 ] ( 0, 0 ) simd32 -> zero-ref "impAppendStmt"
;* V42 tmp7 [V42 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
; V43 tmp8 [V43,T23] ( 3, 3 ) simd32 -> mm0 "fgMakeTemp is creating a new local variable"
;* V44 tmp9 [V44 ] ( 0, 0 ) simd32 -> zero-ref "impAppendStmt"
;* V45 tmp10 [V45 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
; V46 tmp11 [V46,T18] ( 3, 24 ) simd32 -> mm2 "fgMakeTemp is creating a new local variable"
;* V47 tmp12 [V47 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V48 tmp13 [V48 ] ( 0, 0 ) simd32 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V49 tmp14 [V49 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
;* V50 tmp15 [V50 ] ( 0, 0 ) simd32 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V51 tmp16 [V51 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
; V52 tmp17 [V52,T14] ( 5, 2.50) int -> rdx "Inline stloc first use temp"
; V53 tmp18 [V53,T10] ( 3, 3 ) byref -> rax "Inlining Arg"
;* V54 tmp19 [V54 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-; V55 tmp20 [V55,T15] ( 5, 2.50) int -> rdx "Inline stloc first use temp"
-; V56 tmp21 [V56,T11] ( 3, 3 ) byref -> rcx "Inlining Arg"
+; V55 tmp20 [V55,T15] ( 5, 2.50) int -> rcx "Inline stloc first use temp"
+; V56 tmp21 [V56,T11] ( 3, 3 ) byref -> rdx "Inlining Arg"
; V57 tmp22 [V57,T06] ( 5, 5 ) int -> rax "Single return block return value"
;* V58 tmp23 [V58 ] ( 0, 0 ) simd32 -> zero-ref "field V08._lower (fldOffset=0x0)" P-INDEP
;* V59 tmp24 [V59 ] ( 0, 0 ) simd32 -> zero-ref "field V08._upper (fldOffset=0x20)" P-INDEP
;* V60 tmp25 [V60 ] ( 0, 0 ) simd32 -> zero-ref "field V09._lower (fldOffset=0x0)" P-INDEP
;* V61 tmp26 [V61 ] ( 0, 0 ) simd32 -> zero-ref "field V09._upper (fldOffset=0x20)" P-INDEP
;* V62 tmp27 [V62 ] ( 0, 0 ) simd32 -> zero-ref "field V11._lower (fldOffset=0x0)" P-INDEP
;* V63 tmp28 [V63 ] ( 0, 0 ) simd32 -> zero-ref "field V11._upper (fldOffset=0x20)" P-INDEP
;* V64 tmp29 [V64 ] ( 0, 0 ) simd32 -> zero-ref "field V12._lower (fldOffset=0x0)" P-INDEP
;* V65 tmp30 [V65 ] ( 0, 0 ) simd32 -> zero-ref "field V12._upper (fldOffset=0x20)" P-INDEP
;* V66 tmp31 [V66 ] ( 0, 0 ) simd32 -> zero-ref "field V15._lower (fldOffset=0x0)" P-INDEP
;* V67 tmp32 [V67 ] ( 0, 0 ) simd32 -> zero-ref "field V15._upper (fldOffset=0x20)" P-INDEP
;* V68 tmp33 [V68 ] ( 0, 0 ) simd32 -> zero-ref "field V16._lower (fldOffset=0x0)" P-INDEP
;* V69 tmp34 [V69 ] ( 0, 0 ) simd32 -> zero-ref "field V16._upper (fldOffset=0x20)" P-INDEP
;
; Lcl frame size = 0
; Emitted body shown as a diff: '-' lines are the base codegen, '+' lines the new codegen.
; Incoming registers per the local table: rdi = arg0 (byref to 16-bit elements), rsi = arg1, rdx = arg2, rcx = arg3 (int).
; The only instructions removed are four movzx ahead of vmovd/vpbroadcastb; the other changed lines are
; register re-assignments (rax/rcx/rdx/rsi) and loop-alignment padding.
G_M62233_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M62233_IG02:
; arg3 < 8: take the scalar loop at IG10.
cmp ecx, 8
jl G_M62233_IG10
;; size=9 bbWeight=1 PerfScore 1.25
G_M62233_IG03:
mov rax, rdi
; arg3 > 16: use the 256-bit path at IG05; otherwise fall through to the 128-bit path.
cmp ecx, 16
jg SHORT G_M62233_IG05
; Dropped zero-extensions: vpbroadcastb reads only the low byte of its source.
- movzx rsi, sil
vmovd xmm0, esi
vpbroadcastb xmm0, xmm0
- movzx rdx, dl
vmovd xmm1, edx
vpbroadcastb xmm1, xmm1
; rax = address of the last 8 elements (arg0 + 2*(arg3 - 8)).
add ecx, -8
- movsxd rax, ecx
- lea rax, bword ptr [rdi+2*rax]
+ movsxd rsi, ecx
+ lea rax, bword ptr [rdi+2*rsi]
; First load address = min(arg0, rax) (unsigned), kept in rcx (base) / rdx (new).
cmp rdi, rax
- mov rcx, rdi
- cmova rcx, rax
- vmovups xmm2, xmmword ptr [rcx]
+ mov rdx, rdi
+ cmova rdx, rax
+ vmovups xmm2, xmmword ptr [rdx]
; Pack two 8-element loads to bytes, then flag byte lanes where (value - arg1) <= arg2 (unsigned).
vpackuswb xmm2, xmm2, xmmword ptr [rax]
vpsubb xmm0, xmm2, xmm0
vpminub xmm1, xmm0, xmm1
vpcmpeqb xmm0, xmm1, xmm0
vptest xmm0, xmm0
je G_M62233_IG12
; Index of the first matching lane; lanes 8..15 came from the second load at rax.
- vpmovmskb edx, xmm0
- tzcnt edx, edx
- cmp edx, 8
+ vpmovmskb ecx, xmm0
+ tzcnt ecx, ecx
+ cmp ecx, 8
jl SHORT G_M62233_IG04
- mov rcx, rax
- add edx, -8
- ;; size=103 bbWeight=0.50 PerfScore 13.92
+ mov rdx, rax
+ add ecx, -8
+ ;; size=96 bbWeight=0.50 PerfScore 13.67
G_M62233_IG04:
; Result = (load address - arg0) / 2 + lane index.
- sub rcx, rdi
- shr rcx, 1
- lea eax, [rcx+rdx]
+ sub rdx, rdi
+ shr rdx, 1
+ lea eax, [rdx+rcx]
jmp G_M62233_IG15
; Alignment padding ahead of the IG06 loop grows from 0 to 4 bytes because the preceding code shrank.
- align [0 bytes for IG06]
- ;; size=14 bbWeight=0.50 PerfScore 1.62
+ align [4 bytes for IG06]
+ ;; size=18 bbWeight=0.50 PerfScore 1.62
G_M62233_IG05:
; 256-bit path: same dropped zero-extensions before the byte broadcasts.
- movzx rsi, sil
vmovd xmm0, esi
vpbroadcastb ymm0, ymm0
- movzx rdx, dl
vmovd xmm1, edx
vpbroadcastb ymm1, ymm1
cmp ecx, 32
jle SHORT G_M62233_IG07
; rdx = loop end address: arg0 + 2*(arg3 - 32).
- lea edx, [rcx-0x20]
- movsxd rdx, edx
+ lea esi, [rcx-0x20]
+ movsxd rdx, esi
lea rdx, bword ptr [rax+2*rdx]
- ;; size=40 bbWeight=0.50 PerfScore 5.50
+ ;; size=33 bbWeight=0.50 PerfScore 5.25
G_M62233_IG06:
; Main loop: 32 elements (64 bytes) per iteration, packed to bytes and range-tested as above.
vmovups ymm2, ymmword ptr [rax]
vpackuswb ymm2, ymm2, ymmword ptr [rax+0x20]
vpsubb ymm2, ymm2, ymm0
vpminub ymm3, ymm2, ymm1
vpcmpeqb ymm2, ymm3, ymm2
vptest ymm2, ymm2
jne SHORT G_M62233_IG09
add rax, 64
cmp rax, rdx
jb SHORT G_M62233_IG06
;; size=37 bbWeight=4 PerfScore 67.33
G_M62233_IG07:
; Tail: rcx = address of the last 16 elements; first load address = min(rax, rcx) (unsigned).
add ecx, -16
movsxd rcx, ecx
lea rcx, bword ptr [rdi+2*rcx]
cmp rax, rcx
cmova rax, rcx
vmovups ymm2, ymmword ptr [rax]
vpackuswb ymm2, ymm2, ymmword ptr [rcx]
vpsubb ymm0, ymm2, ymm0
vpminub ymm1, ymm1, ymm0
vpcmpeqb ymm0, ymm1, ymm0
vptest ymm0, ymm0
je SHORT G_M62233_IG12
; imm -40 = 0xD8 reorders qwords to 0,2,1,3 so mask bits 0..15 map to the first load and 16..31 to the second.
vpermq ymm0, ymm0, -40
vpmovmskb edx, ymm0
tzcnt edx, edx
cmp edx, 16
jl SHORT G_M62233_IG08
mov rax, rcx
add edx, -16
;; size=69 bbWeight=0.50 PerfScore 12.79
G_M62233_IG08:
; Result = (load address - arg0) / 2 + lane index.
sub rax, rdi
shr rax, 1
add eax, edx
jmp SHORT G_M62233_IG15
; Alignment padding ahead of the IG11 loop grows from 0 to 4 bytes.
- align [0 bytes for IG11]
- ;; size=10 bbWeight=0.50 PerfScore 1.50
+ align [4 bytes for IG11]
+ ;; size=14 bbWeight=0.50 PerfScore 1.50
G_M62233_IG09:
; Match found inside the main loop: element offset of rax plus index of the first matching lane.
sub rax, rdi
shr rax, 1
vpermq ymm0, ymm2, -40
vpmovmskb edi, ymm0
xor ecx, ecx
tzcnt ecx, edi
add eax, ecx
jmp SHORT G_M62233_IG15
;; size=26 bbWeight=0.50 PerfScore 5.12
G_M62233_IG10:
; Scalar path: rax = arg1, rdx = arg2 (sign-extended), esi = running index.
movsx rax, si
movsx rdx, dx
xor esi, esi
test ecx, ecx
jle SHORT G_M62233_IG12
;; size=14 bbWeight=0.50 PerfScore 1.00
G_M62233_IG11:
; Element matches when (element - arg1) <= arg2 as an unsigned 32-bit compare.
movsxd r8, esi
movsx r8, word ptr [rdi+2*r8]
sub r8d, eax
cmp r8d, edx
jbe SHORT G_M62233_IG14
inc esi
cmp esi, ecx
jl SHORT G_M62233_IG11
;; size=22 bbWeight=4 PerfScore 29.00
G_M62233_IG12:
; No match: return -1.
mov eax, -1
;; size=5 bbWeight=0.50 PerfScore 0.12
G_M62233_IG13:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M62233_IG14:
; Scalar match: return the running index.
mov eax, esi
;; size=2 bbWeight=0.50 PerfScore 0.12
G_M62233_IG15:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
; Net effect: 4 instructions removed, 6 bytes of code saved.
-; Total bytes of code 365, prolog size 4, PerfScore 143.04, instruction count 111, allocated bytes for code 369 (MethodHash=1fbe0ce6) for method System.PackedSpanHelpers:IndexOfAnyInRange[System.SpanHelpers+DontNegate`1[short]](byref,short,short,int):int (FullOpts)
+; Total bytes of code 359, prolog size 4, PerfScore 142.54, instruction count 107, allocated bytes for code 359 (MethodHash=1fbe0ce6) for method System.PackedSpanHelpers:IndexOfAnyInRange[System.SpanHelpers+DontNegate`1[short]](byref,short,short,int):int (FullOpts)
; ============================================================
-5 (-2.31 % of base) - System.SpanHelpers:Fill[short](byref,ulong,short)
; Assembly listing for method System.SpanHelpers:Fill[short](byref,ulong,short) (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 1 single block inlinees; 1 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T01] ( 18, 38 ) byref -> rdi single-def
; V01 arg1 [V01,T06] ( 10, 6.50) long -> rsi single-def
; V02 arg2 [V02,T02] ( 18, 38 ) short -> rdx single-def
; V03 loc0 [V03,T00] ( 23, 50 ) long -> rax
;* V04 loc1 [V04 ] ( 0, 0 ) short -> zero-ref ld-addr-op
; V05 loc2 [V05,T10] ( 5, 9.50) simd32 -> mm0 ld-addr-op <System.Numerics.Vector`1[ubyte]>
; V06 loc3 [V06,T05] ( 5, 9.50) byref -> rdi single-def
; V07 loc4 [V07,T09] ( 4, 2 ) long -> rax
; V08 loc5 [V08,T07] ( 2, 4.50) long -> rcx
; V09 loc6 [V09,T03] ( 7, 21 ) long -> rdx
;* V10 loc7 [V10 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V11 loc8 [V11,T08] ( 2, 4.50) long -> rcx
;# V12 OutArgs [V12 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V13 tmp1 [V13,T04] ( 2, 16 ) long -> rax "dup spill"
;* V14 tmp2 [V14 ] ( 0, 0 ) simd32 -> zero-ref ld-addr-op "NewObj constructor temp" <System.Numerics.Vector`1[ushort]>
;* V15 tmp3 [V15 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V16 tmp4 [V16 ] ( 0, 0 ) ushort -> zero-ref "Inlining Arg"
;
; Lcl frame size = 0
; Emitted body shown as a diff: '-' lines are the base codegen, '+' lines the new codegen.
; Incoming registers per the local table: rdi = arg0 (byref), rsi = arg1 (element count), rdx = arg2 (16-bit value).
G_M24463_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M24463_IG02:
; Counts of 16 or more take the vector path at IG09.
cmp rsi, 16
jae G_M24463_IG09
;; size=10 bbWeight=1 PerfScore 1.25
G_M24463_IG03:
; Scalar path: rax = element index, rcx = count rounded down to a multiple of 8.
xor eax, eax
cmp rsi, 8
jb SHORT G_M24463_IG05
mov rcx, rsi
and rcx, -8
align [3 bytes for IG04]
;; size=18 bbWeight=0.50 PerfScore 1.12
G_M24463_IG04:
; Unrolled loop: eight 16-bit stores per iteration.
mov word ptr [rdi+2*rax], dx
mov word ptr [rdi+2*rax+0x02], dx
mov word ptr [rdi+2*rax+0x04], dx
mov word ptr [rdi+2*rax+0x06], dx
mov word ptr [rdi+2*rax+0x08], dx
mov word ptr [rdi+2*rax+0x0A], dx
mov word ptr [rdi+2*rax+0x0C], dx
mov word ptr [rdi+2*rax+0x0E], dx
add rax, 8
cmp rax, rcx
jb SHORT G_M24463_IG04
;; size=48 bbWeight=4 PerfScore 38.00
G_M24463_IG05:
; Remainder: bits 2, 1 and 0 of the count select 4, 2 and 1 extra stores.
test sil, 4
je SHORT G_M24463_IG06
mov word ptr [rdi+2*rax], dx
mov word ptr [rdi+2*rax+0x02], dx
mov word ptr [rdi+2*rax+0x04], dx
mov word ptr [rdi+2*rax+0x06], dx
add rax, 4
;; size=29 bbWeight=0.50 PerfScore 2.75
G_M24463_IG06:
test sil, 2
je SHORT G_M24463_IG07
mov word ptr [rdi+2*rax], dx
mov word ptr [rdi+2*rax+0x02], dx
add rax, 2
;; size=19 bbWeight=0.50 PerfScore 1.75
G_M24463_IG07:
test sil, 1
je SHORT G_M24463_IG08
mov word ptr [rdi+2*rax], dx
;; size=10 bbWeight=0.50 PerfScore 1.12
G_M24463_IG08:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M24463_IG09:
; Vector path. The explicit zero-extension of dx is dropped: vpbroadcastw reads only the low word of xmm0.
- movzx rax, dx
- vmovd xmm0, eax
+ vmovd xmm0, edx
vpbroadcastw ymm0, ymm0
; rax = byte length (2 * count); rcx = byte length rounded down to a multiple of 64; rdx = byte offset.
lea rax, [rsi+rsi]
mov rcx, rax
and rcx, -64
xor edx, edx
cmp rsi, 32
jb SHORT G_M24463_IG11
; Alignment padding ahead of the IG10 loop shrinks from 2 bytes to 0.
- align [2 bytes for IG10]
- ;; size=33 bbWeight=0.50 PerfScore 3.50
+ align [0 bytes for IG10]
+ ;; size=28 bbWeight=0.50 PerfScore 3.25
G_M24463_IG10:
; Main loop: 64 bytes (two 32-byte stores) per iteration.
vmovups ymmword ptr [rdi+rdx], ymm0
vmovups ymmword ptr [rdi+rdx+0x20], ymm0
add rdx, 64
cmp rdx, rcx
jb SHORT G_M24463_IG10
;; size=20 bbWeight=4 PerfScore 22.00
G_M24463_IG11:
; One more 32-byte store when bit 5 of the byte length is set.
test al, 32
je SHORT G_M24463_IG12
vmovups ymmword ptr [rdi+rdx], ymm0
;; size=9 bbWeight=0.50 PerfScore 1.62
G_M24463_IG12:
; Final store covers the last 32 bytes of the range; it may overlap earlier stores.
vmovups ymmword ptr [rdi+rax-0x20], ymm0
;; size=6 bbWeight=0.50 PerfScore 1.00
G_M24463_IG13:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
; Net effect: 1 instruction removed, 5 bytes saved (including the alignment padding).
-; Total bytes of code 216, prolog size 4, PerfScore 77.88, instruction count 61, allocated bytes for code 216 (MethodHash=7507a070) for method System.SpanHelpers:Fill[short](byref,ulong,short) (FullOpts)
+; Total bytes of code 211, prolog size 4, PerfScore 77.62, instruction count 60, allocated bytes for code 211 (MethodHash=7507a070) for method System.SpanHelpers:Fill[short](byref,ulong,short) (FullOpts)
; ============================================================
-5 (-2.31 % of base) - System.SpanHelpers:Fill[ushort](byref,ulong,ushort)
; Assembly listing for method System.SpanHelpers:Fill[ushort](byref,ulong,ushort) (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 1 single block inlinees; 1 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T01] ( 18, 38 ) byref -> rdi single-def
; V01 arg1 [V01,T06] ( 10, 6.50) long -> rsi single-def
; V02 arg2 [V02,T02] ( 18, 38 ) ushort -> rdx single-def
; V03 loc0 [V03,T00] ( 23, 50 ) long -> rax
;* V04 loc1 [V04 ] ( 0, 0 ) ushort -> zero-ref ld-addr-op
; V05 loc2 [V05,T10] ( 5, 9.50) simd32 -> mm0 ld-addr-op <System.Numerics.Vector`1[ubyte]>
; V06 loc3 [V06,T05] ( 5, 9.50) byref -> rdi single-def
; V07 loc4 [V07,T09] ( 4, 2 ) long -> rax
; V08 loc5 [V08,T07] ( 2, 4.50) long -> rcx
; V09 loc6 [V09,T03] ( 7, 21 ) long -> rdx
;* V10 loc7 [V10 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V11 loc8 [V11,T08] ( 2, 4.50) long -> rcx
;# V12 OutArgs [V12 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V13 tmp1 [V13,T04] ( 2, 16 ) long -> rax "dup spill"
;* V14 tmp2 [V14 ] ( 0, 0 ) simd32 -> zero-ref ld-addr-op "NewObj constructor temp" <System.Numerics.Vector`1[ushort]>
;* V15 tmp3 [V15 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V16 tmp4 [V16 ] ( 0, 0 ) ushort -> zero-ref "Inlining Arg"
;
; Lcl frame size = 0
; Emitted body shown as a diff: '-' lines are the base codegen, '+' lines the new codegen.
; Incoming registers per the local table: rdi = arg0 (byref), rsi = arg1 (element count), rdx = arg2 (16-bit value).
G_M51983_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M51983_IG02:
; Counts of 16 or more take the vector path at IG09.
cmp rsi, 16
jae G_M51983_IG09
;; size=10 bbWeight=1 PerfScore 1.25
G_M51983_IG03:
; Scalar path: rax = element index, rcx = count rounded down to a multiple of 8.
xor eax, eax
cmp rsi, 8
jb SHORT G_M51983_IG05
mov rcx, rsi
and rcx, -8
align [3 bytes for IG04]
;; size=18 bbWeight=0.50 PerfScore 1.12
G_M51983_IG04:
; Unrolled loop: eight 16-bit stores per iteration.
mov word ptr [rdi+2*rax], dx
mov word ptr [rdi+2*rax+0x02], dx
mov word ptr [rdi+2*rax+0x04], dx
mov word ptr [rdi+2*rax+0x06], dx
mov word ptr [rdi+2*rax+0x08], dx
mov word ptr [rdi+2*rax+0x0A], dx
mov word ptr [rdi+2*rax+0x0C], dx
mov word ptr [rdi+2*rax+0x0E], dx
add rax, 8
cmp rax, rcx
jb SHORT G_M51983_IG04
;; size=48 bbWeight=4 PerfScore 38.00
G_M51983_IG05:
; Remainder: bits 2, 1 and 0 of the count select 4, 2 and 1 extra stores.
test sil, 4
je SHORT G_M51983_IG06
mov word ptr [rdi+2*rax], dx
mov word ptr [rdi+2*rax+0x02], dx
mov word ptr [rdi+2*rax+0x04], dx
mov word ptr [rdi+2*rax+0x06], dx
add rax, 4
;; size=29 bbWeight=0.50 PerfScore 2.75
G_M51983_IG06:
test sil, 2
je SHORT G_M51983_IG07
mov word ptr [rdi+2*rax], dx
mov word ptr [rdi+2*rax+0x02], dx
add rax, 2
;; size=19 bbWeight=0.50 PerfScore 1.75
G_M51983_IG07:
test sil, 1
je SHORT G_M51983_IG08
mov word ptr [rdi+2*rax], dx
;; size=10 bbWeight=0.50 PerfScore 1.12
G_M51983_IG08:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
G_M51983_IG09:
; Vector path. The explicit zero-extension of dx is dropped: vpbroadcastw reads only the low word of xmm0.
- movzx rax, dx
- vmovd xmm0, eax
+ vmovd xmm0, edx
vpbroadcastw ymm0, ymm0
; rax = byte length (2 * count); rcx = byte length rounded down to a multiple of 64; rdx = byte offset.
lea rax, [rsi+rsi]
mov rcx, rax
and rcx, -64
xor edx, edx
cmp rsi, 32
jb SHORT G_M51983_IG11
; Alignment padding ahead of the IG10 loop shrinks from 2 bytes to 0.
- align [2 bytes for IG10]
- ;; size=33 bbWeight=0.50 PerfScore 3.50
+ align [0 bytes for IG10]
+ ;; size=28 bbWeight=0.50 PerfScore 3.25
G_M51983_IG10:
; Main loop: 64 bytes (two 32-byte stores) per iteration.
vmovups ymmword ptr [rdi+rdx], ymm0
vmovups ymmword ptr [rdi+rdx+0x20], ymm0
add rdx, 64
cmp rdx, rcx
jb SHORT G_M51983_IG10
;; size=20 bbWeight=4 PerfScore 22.00
G_M51983_IG11:
; One more 32-byte store when bit 5 of the byte length is set.
test al, 32
je SHORT G_M51983_IG12
vmovups ymmword ptr [rdi+rdx], ymm0
;; size=9 bbWeight=0.50 PerfScore 1.62
G_M51983_IG12:
; Final store covers the last 32 bytes of the range; it may overlap earlier stores.
vmovups ymmword ptr [rdi+rax-0x20], ymm0
;; size=6 bbWeight=0.50 PerfScore 1.00
G_M51983_IG13:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=0.50 PerfScore 1.25
; Net effect: 1 instruction removed, 5 bytes saved (including the alignment padding).
-; Total bytes of code 216, prolog size 4, PerfScore 77.88, instruction count 61, allocated bytes for code 216 (MethodHash=267434f0) for method System.SpanHelpers:Fill[ushort](byref,ulong,ushort) (FullOpts)
+; Total bytes of code 211, prolog size 4, PerfScore 77.62, instruction count 60, allocated bytes for code 211 (MethodHash=267434f0) for method System.SpanHelpers:Fill[ushort](byref,ulong,ushort) (FullOpts)
; ============================================================
-4 (-13.33 % of base) - System.Numerics.Vector:Multiply[short](short,System.Numerics.Vector`1[short]):System.Numerics.Vector`1[short]
; Assembly listing for method System.Numerics.Vector:Multiply[short](short,System.Numerics.Vector`1[short]):System.Numerics.Vector`1[short] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T00] ( 4, 4 ) byref -> rdi single-def
; V01 arg0 [V01,T01] ( 3, 3 ) short -> rsi single-def
; V02 arg1 [V02,T02] ( 1, 1 ) simd32 -> [rsp+0x08] single-def <System.Numerics.Vector`1[short]>
;# V03 OutArgs [V03 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0
; Emitted body shown as a diff: '-' lines are the base codegen, '+' lines the new codegen.
; Per the local table: rdi = RetBuf, rsi = arg0 (short scalar), arg1 (vector) lives at [rsp+0x08].
G_M33721_IG01:
;; size=0 bbWeight=1 PerfScore 0.00
G_M33721_IG02:
; The explicit sign-extension of si is dropped: vpbroadcastw reads only the low word of xmm0.
- movsx rax, si
- vmovd xmm0, eax
+ vmovd xmm0, esi
vpbroadcastw ymm0, ymm0
; Word-wise multiply of the broadcast scalar with the vector argument on the stack.
vpmullw ymm0, ymm0, ymmword ptr [rsp+0x08]
; Store the result through the return buffer and return its address in rax.
vmovups ymmword ptr [rdi], ymm0
mov rax, rdi
- ;; size=26 bbWeight=1 PerfScore 12.50
+ ;; size=22 bbWeight=1 PerfScore 12.25
G_M33721_IG03:
vzeroupper
ret
;; size=4 bbWeight=1 PerfScore 2.00
; Net effect: 1 instruction removed, 4 bytes saved.
-; Total bytes of code 30, prolog size 0, PerfScore 14.50, instruction count 8, allocated bytes for code 30 (MethodHash=97457c46) for method System.Numerics.Vector:Multiply[short](short,System.Numerics.Vector`1[short]):System.Numerics.Vector`1[short] (FullOpts)
+; Total bytes of code 26, prolog size 0, PerfScore 14.25, instruction count 7, allocated bytes for code 26 (MethodHash=97457c46) for method System.Numerics.Vector:Multiply[short](short,System.Numerics.Vector`1[short]):System.Numerics.Vector`1[short] (FullOpts)
; ============================================================
-4 (-13.33 % of base) - System.Numerics.Vector:Multiply[short](System.Numerics.Vector`1[short],short):System.Numerics.Vector`1[short]
; Assembly listing for method System.Numerics.Vector:Multiply[short](System.Numerics.Vector`1[short],short):System.Numerics.Vector`1[short] (FullOpts)
; Diff note: the sign-extension of the short argument (movsx rax, si) ahead of
; vmovd is removed; esi is moved into xmm0 as-is. Only the low 16 bits are
; consumed by vpbroadcastw, so the result is unaffected.
; Totals below: 30 -> 26 bytes, 8 -> 7 instructions.
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T00] ( 4, 4 ) byref -> rdi single-def
; V01 arg0 [V01,T02] ( 1, 1 ) simd32 -> [rsp+0x08] single-def <System.Numerics.Vector`1[short]>
; V02 arg1 [V02,T01] ( 3, 3 ) short -> rsi single-def
;# V03 OutArgs [V03 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0
G_M16569_IG01:
;; size=0 bbWeight=1 PerfScore 0.00
G_M16569_IG02:
; Splat the scalar (arg1 in rsi) across all 16-bit lanes of ymm0.
- movsx rax, si
- vmovd xmm0, eax
+ vmovd xmm0, esi
vpbroadcastw ymm0, ymm0
; Multiply lane-by-lane with the vector argument (arg0) read from [rsp+0x08].
vpmullw ymm0, ymm0, ymmword ptr [rsp+0x08]
; Write the product to the RetBuf (rdi); rax carries the RetBuf address back.
vmovups ymmword ptr [rdi], ymm0
mov rax, rdi
- ;; size=26 bbWeight=1 PerfScore 12.50
+ ;; size=22 bbWeight=1 PerfScore 12.25
G_M16569_IG03:
vzeroupper
ret
;; size=4 bbWeight=1 PerfScore 2.00
-; Total bytes of code 30, prolog size 0, PerfScore 14.50, instruction count 8, allocated bytes for code 30 (MethodHash=2ef4bf46) for method System.Numerics.Vector:Multiply[short](System.Numerics.Vector`1[short],short):System.Numerics.Vector`1[short] (FullOpts)
+; Total bytes of code 26, prolog size 0, PerfScore 14.25, instruction count 7, allocated bytes for code 26 (MethodHash=2ef4bf46) for method System.Numerics.Vector:Multiply[short](System.Numerics.Vector`1[short],short):System.Numerics.Vector`1[short] (FullOpts)
; ============================================================
-4 (-4.08 % of base) - System.Numerics.Vector:Multiply[ubyte](System.Numerics.Vector`1[ubyte],ubyte):System.Numerics.Vector`1[ubyte]
; Assembly listing for method System.Numerics.Vector:Multiply[ubyte](System.Numerics.Vector`1[ubyte],ubyte):System.Numerics.Vector`1[ubyte] (FullOpts)
; Diff note: the base code zero-extended the byte argument (sil) into rax and
; then moved eax into xmm0; the new code moves esi into xmm0 directly.
; vpbroadcastb replicates only the low 8 bits of xmm0, so the upper bits are
; irrelevant. The element loop in IG03 is untouched by this diff.
; Totals below: 98 -> 94 bytes, 29 -> 28 instructions.
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 4 single block inlinees; 2 inlinees without PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T02] ( 4, 4 ) byref -> rdi single-def
; V01 arg0 [V01,T04] ( 1, 0.25) simd32 -> [rbp+0x10] single-def <System.Numerics.Vector`1[ubyte]>
; V02 arg1 [V02,T03] ( 3, 2.25) ubyte -> rsi single-def
;# V03 OutArgs [V03 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V04 tmp1 [V04 ] ( 2, 5 ) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <System.Numerics.Vector`1[ubyte]>
; V05 tmp2 [V05,T00] ( 5, 16.25) int -> rax "Inline stloc first use temp"
; V06 tmp3 [V06 ] ( 2, 8.50) simd32 -> [rbp-0x50] do-not-enreg[XS] addr-exposed ld-addr-op "Inlining Arg" <System.Numerics.Vector`1[ubyte]>
; V07 tmp4 [V07 ] ( 2, 8.50) simd32 -> [rbp-0x70] do-not-enreg[XS] addr-exposed ld-addr-op "Inlining Arg" <System.Numerics.Vector`1[ubyte]>
;* V08 tmp5 [V08 ] ( 0, 0 ) int -> zero-ref "impAppendStmt"
;* V09 tmp6 [V09 ] ( 0, 0 ) ubyte -> zero-ref "Inline stloc first use temp"
;* V10 tmp7 [V10 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V11 tmp8 [V11 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V12 tmp9 [V12 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
; V13 cse0 [V13,T01] ( 4, 16 ) long -> rdx "CSE #01: aggressive"
;
; Lcl frame size = 112
G_M18297_IG01:
; Prologue: save rbp, reserve the 112-byte local frame, point rbp at the saved-rbp slot.
push rbp
sub rsp, 112
lea rbp, [rsp+0x70]
;; size=10 bbWeight=0.25 PerfScore 0.44
G_M18297_IG02:
; Copy the vector argument (arg0 at [rbp+0x10]) into the tmp3 slot at [rbp-0x50].
vmovups ymm0, ymmword ptr [rbp+0x10]
vmovups ymmword ptr [rbp-0x50], ymm0
; Splat the scalar byte (arg1 in rsi) across all byte lanes and spill it to tmp4 at [rbp-0x70].
- movzx rax, sil
- vmovd xmm0, eax
+ vmovd xmm0, esi
vpbroadcastb ymm0, ymm0
vmovups ymmword ptr [rbp-0x70], ymm0
; Element index (tmp2 in eax) starts at zero.
xor eax, eax
align [0 bytes for IG03]
- ;; size=30 bbWeight=0.25 PerfScore 2.62
+ ;; size=26 bbWeight=0.25 PerfScore 2.56
G_M18297_IG03:
; Scalar loop over the 32 byte elements: tmp1[i] = low byte of (tmp3[i] * tmp4[i]).
; rdx holds the sign-extended index; rsi is reused as scratch here because the
; scalar was already consumed by the broadcast in IG02.
lea rcx, [rbp-0x50]
movsxd rdx, eax
movzx rcx, byte ptr [rcx+rdx]
lea rsi, [rbp-0x70]
movzx rsi, byte ptr [rsi+rdx]
imul ecx, esi
lea rsi, [rbp-0x30]
mov byte ptr [rsi+rdx], cl
; Advance the index and repeat while it is below 32.
inc eax
cmp eax, 32
jl SHORT G_M18297_IG03
;; size=37 bbWeight=4 PerfScore 41.00
G_M18297_IG04:
; Copy the result from tmp1 ([rbp-0x30]) to the RetBuf (rdi); return the RetBuf address in rax.
vmovups ymm0, ymmword ptr [rbp-0x30]
vmovups ymmword ptr [rdi], ymm0
mov rax, rdi
;; size=12 bbWeight=1 PerfScore 6.25
G_M18297_IG05:
; Epilogue: release the 112-byte frame and restore rbp.
vzeroupper
add rsp, 112
pop rbp
ret
;; size=9 bbWeight=1 PerfScore 2.75
-; Total bytes of code 98, prolog size 10, PerfScore 53.06, instruction count 29, allocated bytes for code 98 (MethodHash=9485b886) for method System.Numerics.Vector:Multiply[ubyte](System.Numerics.Vector`1[ubyte],ubyte):System.Numerics.Vector`1[ubyte] (FullOpts)
+; Total bytes of code 94, prolog size 10, PerfScore 53.00, instruction count 28, allocated bytes for code 94 (MethodHash=9485b886) for method System.Numerics.Vector:Multiply[ubyte](System.Numerics.Vector`1[ubyte],ubyte):System.Numerics.Vector`1[ubyte] (FullOpts)
; ============================================================
-4 (-4.08 % of base) - System.Numerics.Vector:Multiply[ubyte](ubyte,System.Numerics.Vector`1[ubyte]):System.Numerics.Vector`1[ubyte]
; Assembly listing for method System.Numerics.Vector:Multiply[ubyte](ubyte,System.Numerics.Vector`1[ubyte]):System.Numerics.Vector`1[ubyte] (FullOpts)
; Diff note: the zero-extension of the byte argument (movzx rax, sil) ahead of
; vmovd is removed; esi is moved into xmm0 as-is. vpbroadcastb consumes only
; the low 8 bits, so the result is unaffected. The element loop in IG03 is
; unchanged. Totals below: 98 -> 94 bytes, 29 -> 28 instructions.
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 4 single block inlinees; 2 inlinees without PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T02] ( 4, 4 ) byref -> rdi single-def
; V01 arg0 [V01,T03] ( 3, 2.25) ubyte -> rsi single-def
; V02 arg1 [V02,T04] ( 1, 0.25) simd32 -> [rbp+0x10] single-def <System.Numerics.Vector`1[ubyte]>
;# V03 OutArgs [V03 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V04 tmp1 [V04 ] ( 2, 5 ) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <System.Numerics.Vector`1[ubyte]>
; V05 tmp2 [V05,T00] ( 5, 16.25) int -> rax "Inline stloc first use temp"
; V06 tmp3 [V06 ] ( 2, 8.50) simd32 -> [rbp-0x50] do-not-enreg[XS] addr-exposed ld-addr-op "Inlining Arg" <System.Numerics.Vector`1[ubyte]>
; V07 tmp4 [V07 ] ( 2, 8.50) simd32 -> [rbp-0x70] do-not-enreg[XS] addr-exposed ld-addr-op "Inlining Arg" <System.Numerics.Vector`1[ubyte]>
;* V08 tmp5 [V08 ] ( 0, 0 ) int -> zero-ref "impAppendStmt"
;* V09 tmp6 [V09 ] ( 0, 0 ) ubyte -> zero-ref "Inline stloc first use temp"
;* V10 tmp7 [V10 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
;* V11 tmp8 [V11 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V12 tmp9 [V12 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
; V13 cse0 [V13,T01] ( 4, 16 ) long -> rdx "CSE #01: aggressive"
;
; Lcl frame size = 112
G_M24697_IG01:
; Prologue: save rbp, carve out the 112-byte local frame, set rbp to the saved-rbp slot.
push rbp
sub rsp, 112
lea rbp, [rsp+0x70]
;; size=10 bbWeight=0.25 PerfScore 0.44
G_M24697_IG02:
; Copy the vector argument (arg1 at [rbp+0x10]) into the tmp3 slot at [rbp-0x50].
vmovups ymm0, ymmword ptr [rbp+0x10]
vmovups ymmword ptr [rbp-0x50], ymm0
; Replicate the scalar byte (arg0 in rsi) into every byte lane; spill to tmp4 at [rbp-0x70].
- movzx rax, sil
- vmovd xmm0, eax
+ vmovd xmm0, esi
vpbroadcastb ymm0, ymm0
vmovups ymmword ptr [rbp-0x70], ymm0
; Loop index (tmp2 in eax) = 0.
xor eax, eax
align [0 bytes for IG03]
- ;; size=30 bbWeight=0.25 PerfScore 2.62
+ ;; size=26 bbWeight=0.25 PerfScore 2.56
G_M24697_IG03:
; Per-element loop, 32 iterations: load one byte from tmp3 and one from tmp4,
; multiply them, and store the low byte of the product into tmp1 at the same index.
; rdx is the sign-extended index; rsi is scratch from here on (the scalar has
; already been broadcast).
lea rcx, [rbp-0x50]
movsxd rdx, eax
movzx rcx, byte ptr [rcx+rdx]
lea rsi, [rbp-0x70]
movzx rsi, byte ptr [rsi+rdx]
imul ecx, esi
lea rsi, [rbp-0x30]
mov byte ptr [rsi+rdx], cl
; Next element; loop while the index is below 32.
inc eax
cmp eax, 32
jl SHORT G_M24697_IG03
;; size=37 bbWeight=4 PerfScore 41.00
G_M24697_IG04:
; Move the finished vector from tmp1 ([rbp-0x30]) into the RetBuf (rdi); rax = RetBuf address.
vmovups ymm0, ymmword ptr [rbp-0x30]
vmovups ymmword ptr [rdi], ymm0
mov rax, rdi
;; size=12 bbWeight=1 PerfScore 6.25
G_M24697_IG05:
; Epilogue: drop the 112-byte frame and restore rbp.
vzeroupper
add rsp, 112
pop rbp
ret
;; size=9 bbWeight=1 PerfScore 2.75
-; Total bytes of code 98, prolog size 10, PerfScore 53.06, instruction count 29, allocated bytes for code 98 (MethodHash=1e1e9f86) for method System.Numerics.Vector:Multiply[ubyte](ubyte,System.Numerics.Vector`1[ubyte]):System.Numerics.Vector`1[ubyte] (FullOpts)
+; Total bytes of code 94, prolog size 10, PerfScore 53.00, instruction count 28, allocated bytes for code 94 (MethodHash=1e1e9f86) for method System.Numerics.Vector:Multiply[ubyte](ubyte,System.Numerics.Vector`1[ubyte]):System.Numerics.Vector`1[ubyte] (FullOpts)
; ============================================================
-4 (-19.05 % of base) - System.Numerics.Vector`1[short]:.ctor(short):this
; Assembly listing for method System.Numerics.Vector`1[short]:.ctor(short):this (FullOpts)
; Diff note: the base code sign-extended the short argument (si) into rax before
; vmovd; the new code moves esi into xmm0 directly. vpbroadcastw uses only the
; low 16 bits of xmm0, so the stored vector is the same.
; Totals below: 21 -> 17 bytes, 6 -> 5 instructions.
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
; V00 this [V00,T00] ( 3, 3 ) byref -> rdi this single-def
; V01 arg1 [V01,T01] ( 3, 3 ) short -> rsi single-def
;# V02 OutArgs [V02 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0
G_M25674_IG01:
;; size=0 bbWeight=1 PerfScore 0.00
G_M25674_IG02:
; Fill every 16-bit lane with the scalar (arg1 in rsi) and store the vector through `this` (rdi).
- movsx rax, si
- vmovd xmm0, eax
+ vmovd xmm0, esi
vpbroadcastw ymm0, ymm0
vmovups ymmword ptr [rdi], ymm0
- ;; size=17 bbWeight=1 PerfScore 6.25
+ ;; size=13 bbWeight=1 PerfScore 6.00
G_M25674_IG03:
vzeroupper
ret
;; size=4 bbWeight=1 PerfScore 2.00
-; Total bytes of code 21, prolog size 0, PerfScore 8.25, instruction count 6, allocated bytes for code 21 (MethodHash=d18e9bb5) for method System.Numerics.Vector`1[short]:.ctor(short):this (FullOpts)
+; Total bytes of code 17, prolog size 0, PerfScore 8.00, instruction count 5, allocated bytes for code 17 (MethodHash=d18e9bb5) for method System.Numerics.Vector`1[short]:.ctor(short):this (FullOpts)
; ============================================================
-4 (-13.33 % of base) - System.Numerics.Vector`1[short]:op_Multiply(short,System.Numerics.Vector`1[short]):System.Numerics.Vector`1[short]
; Assembly listing for method System.Numerics.Vector`1[short]:op_Multiply(short,System.Numerics.Vector`1[short]):System.Numerics.Vector`1[short] (FullOpts)
; Diff note: movsx rax, si + vmovd xmm0, eax collapses to a single
; vmovd xmm0, esi. The following vpbroadcastw reads only the low 16 bits of
; xmm0, so dropping the sign-extension does not change the broadcast value.
; Totals below: 30 -> 26 bytes, 8 -> 7 instructions.
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T00] ( 4, 4 ) byref -> rdi single-def
; V01 arg0 [V01,T01] ( 3, 3 ) short -> rsi single-def
; V02 arg1 [V02,T02] ( 1, 1 ) simd32 -> [rsp+0x08] single-def <System.Numerics.Vector`1[short]>
;# V03 OutArgs [V03 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0
G_M16008_IG01:
;; size=0 bbWeight=1 PerfScore 0.00
G_M16008_IG02:
; Broadcast the scalar operand (arg0 in rsi) to all 16-bit lanes.
- movsx rax, si
- vmovd xmm0, eax
+ vmovd xmm0, esi
vpbroadcastw ymm0, ymm0
; 16-bit lane-wise multiply with the vector operand (arg1) at [rsp+0x08].
vpmullw ymm0, ymm0, ymmword ptr [rsp+0x08]
; Result goes to the RetBuf (rdi); the RetBuf address is returned in rax.
vmovups ymmword ptr [rdi], ymm0
mov rax, rdi
- ;; size=26 bbWeight=1 PerfScore 12.50
+ ;; size=22 bbWeight=1 PerfScore 12.25
G_M16008_IG03:
vzeroupper
ret
;; size=4 bbWeight=1 PerfScore 2.00
-; Total bytes of code 30, prolog size 0, PerfScore 14.50, instruction count 8, allocated bytes for code 30 (MethodHash=8623c177) for method System.Numerics.Vector`1[short]:op_Multiply(short,System.Numerics.Vector`1[short]):System.Numerics.Vector`1[short] (FullOpts)
+; Total bytes of code 26, prolog size 0, PerfScore 14.25, instruction count 7, allocated bytes for code 26 (MethodHash=8623c177) for method System.Numerics.Vector`1[short]:op_Multiply(short,System.Numerics.Vector`1[short]):System.Numerics.Vector`1[short] (FullOpts)
; ============================================================
-4 (-13.33 % of base) - System.Numerics.Vector`1[short]:op_Multiply(System.Numerics.Vector`1[short],short):System.Numerics.Vector`1[short]
; Assembly listing for method System.Numerics.Vector`1[short]:op_Multiply(System.Numerics.Vector`1[short],short):System.Numerics.Vector`1[short] (FullOpts)
; Diff note: the sign-extending move (movsx rax, si) before vmovd is dropped and
; esi feeds vmovd directly. vpbroadcastw takes only the low 16 bits of xmm0,
; so the upper bits of esi are never observed.
; Totals below: 30 -> 26 bytes, 8 -> 7 instructions.
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T00] ( 4, 4 ) byref -> rdi single-def
; V01 arg0 [V01,T02] ( 1, 1 ) simd32 -> [rsp+0x08] single-def <System.Numerics.Vector`1[short]>
; V02 arg1 [V02,T01] ( 3, 3 ) short -> rsi single-def
;# V03 OutArgs [V03 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0
G_M61576_IG01:
;; size=0 bbWeight=1 PerfScore 0.00
G_M61576_IG02:
; Broadcast the scalar operand (arg1 in rsi) to all 16-bit lanes.
- movsx rax, si
- vmovd xmm0, eax
+ vmovd xmm0, esi
vpbroadcastw ymm0, ymm0
; 16-bit lane-wise multiply with the vector operand (arg0) at [rsp+0x08].
vpmullw ymm0, ymm0, ymmword ptr [rsp+0x08]
; Result goes to the RetBuf (rdi); the RetBuf address is returned in rax.
vmovups ymmword ptr [rdi], ymm0
mov rax, rdi
- ;; size=26 bbWeight=1 PerfScore 12.50
+ ;; size=22 bbWeight=1 PerfScore 12.25
G_M61576_IG03:
vzeroupper
ret
;; size=4 bbWeight=1 PerfScore 2.00
-; Total bytes of code 30, prolog size 0, PerfScore 14.50, instruction count 8, allocated bytes for code 30 (MethodHash=45690f77) for method System.Numerics.Vector`1[short]:op_Multiply(System.Numerics.Vector`1[short],short):System.Numerics.Vector`1[short] (FullOpts)
+; Total bytes of code 26, prolog size 0, PerfScore 14.25, instruction count 7, allocated bytes for code 26 (MethodHash=45690f77) for method System.Numerics.Vector`1[short]:op_Multiply(System.Numerics.Vector`1[short],short):System.Numerics.Vector`1[short] (FullOpts)
; ============================================================
-4 (-11.76 % of base) - System.Numerics.Vector`1[short]:System.Runtime.Intrinsics.ISimdVector<System.Numerics.Vector<T>,T>.Any(System.Numerics.Vector`1[short],short):ubyte
; Assembly listing for method System.Numerics.Vector`1[short]:System.Runtime.Intrinsics.ISimdVector<System.Numerics.Vector<T>,T>.Any(System.Numerics.Vector`1[short],short):ubyte (FullOpts)
; Diff note: the base code sign-extended the short argument (di) into rax before
; vmovd; the new code moves edi into xmm0 directly. vpbroadcastw uses only the
; low 16 bits of xmm0, so the comparison operand is identical.
; Totals below: 34 -> 30 bytes, 9 -> 8 instructions.
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T02] ( 1, 1 ) simd32 -> [rsp+0x08] single-def <System.Numerics.Vector`1[short]>
; V01 arg1 [V01,T00] ( 3, 3 ) short -> rdi single-def
;# V02 OutArgs [V02 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V03 rat0 [V03,T01] ( 3, 6 ) simd32 -> mm0 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 0
G_M23858_IG01:
;; size=0 bbWeight=1 PerfScore 0.00
G_M23858_IG02:
; Broadcast the scalar to search for (arg1 in rdi) to all 16-bit lanes.
- movsx rax, di
- vmovd xmm0, eax
+ vmovd xmm0, edi
vpbroadcastw ymm0, ymm0
; Compare each 16-bit lane with the vector argument at [rsp+0x08]; equal lanes become all-ones.
vpcmpeqw ymm0, ymm0, ymmword ptr [rsp+0x08]
; vptest sets ZF only when the compare mask is entirely zero, so setne gives
; al = 1 when at least one lane matched.
vptest ymm0, ymm0
setne al
; Zero-extend the 0/1 result in al for the return value.
movzx rax, al
- ;; size=30 bbWeight=1 PerfScore 12.50
+ ;; size=26 bbWeight=1 PerfScore 12.25
G_M23858_IG03:
vzeroupper
ret
;; size=4 bbWeight=1 PerfScore 2.00
-; Total bytes of code 34, prolog size 0, PerfScore 14.50, instruction count 9, allocated bytes for code 34 (MethodHash=a9bca2cd) for method System.Numerics.Vector`1[short]:System.Runtime.Intrinsics.ISimdVector<System.Numerics.Vector<T>,T>.Any(System.Numerics.Vector`1[short],short):ubyte (FullOpts)
+; Total bytes of code 30, prolog size 0, PerfScore 14.25, instruction count 8, allocated bytes for code 30 (MethodHash=a9bca2cd) for method System.Numerics.Vector`1[short]:System.Runtime.Intrinsics.ISimdVector<System.Numerics.Vector<T>,T>.Any(System.Numerics.Vector`1[short],short):ubyte (FullOpts)
; ============================================================
-4 (-16.67 % of base) - System.Numerics.Vector`1[short]:System.Runtime.Intrinsics.ISimdVector<System.Numerics.Vector<T>,T>.Create(short):System.Numerics.Vector`1[short]
; Assembly listing for method System.Numerics.Vector`1[short]:System.Runtime.Intrinsics.ISimdVector<System.Numerics.Vector<T>,T>.Create(short):System.Numerics.Vector`1[short] (FullOpts)
; Diff note: the sign-extension of the short argument (movsx rax, si) ahead of
; vmovd is removed; esi is moved into xmm0 directly. vpbroadcastw reads only
; the low 16 bits of xmm0, so the created vector is unchanged.
; Totals below: 24 -> 20 bytes, 7 -> 6 instructions.
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T00] ( 4, 4 ) byref -> rdi single-def
; V01 arg0 [V01,T01] ( 3, 3 ) short -> rsi single-def
;# V02 OutArgs [V02 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0
G_M23411_IG01:
;; size=0 bbWeight=1 PerfScore 0.00
G_M23411_IG02:
; Fill every 16-bit lane with the scalar (arg0 in rsi).
- movsx rax, si
- vmovd xmm0, eax
+ vmovd xmm0, esi
vpbroadcastw ymm0, ymm0
; Store the vector through the RetBuf pointer (rdi) and return that pointer in rax.
vmovups ymmword ptr [rdi], ymm0
mov rax, rdi
- ;; size=20 bbWeight=1 PerfScore 6.50
+ ;; size=16 bbWeight=1 PerfScore 6.25
G_M23411_IG03:
vzeroupper
ret
;; size=4 bbWeight=1 PerfScore 2.00
-; Total bytes of code 24, prolog size 0, PerfScore 8.50, instruction count 7, allocated bytes for code 24 (MethodHash=99b6a48c) for method System.Numerics.Vector`1[short]:System.Runtime.Intrinsics.ISimdVector<System.Numerics.Vector<T>,T>.Create(short):System.Numerics.Vector`1[short] (FullOpts)
+; Total bytes of code 20, prolog size 0, PerfScore 8.25, instruction count 6, allocated bytes for code 20 (MethodHash=99b6a48c) for method System.Numerics.Vector`1[short]:System.Runtime.Intrinsics.ISimdVector<System.Numerics.Vector<T>,T>.Create(short):System.Numerics.Vector`1[short] (FullOpts)
; ============================================================
-4 (-13.33 % of base) - System.Numerics.Vector`1[short]:System.Runtime.Intrinsics.ISimdVector<System.Numerics.Vector<T>,T>.Multiply(System.Numerics.Vector`1[short],short):System.Numerics.Vector`1[short]
; Assembly listing for method System.Numerics.Vector`1[short]:System.Runtime.Intrinsics.ISimdVector<System.Numerics.Vector<T>,T>.Multiply(System.Numerics.Vector`1[short],short):System.Numerics.Vector`1[short] (FullOpts)
; Diff note: movsx rax, si + vmovd xmm0, eax is replaced by vmovd xmm0, esi.
; vpbroadcastw consumes only the low 16 bits of xmm0, so the sign-extension in
; the base code had no effect on the result.
; Totals below: 30 -> 26 bytes, 8 -> 7 instructions.
; Emitting BLENDED_CODE for X64 with AVX - Unix
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T00] ( 4, 4 ) byref -> rdi single-def
; V01 arg0 [V01,T02] ( 1, 1 ) simd32 -> [rsp+0x08] single-def <System.Numerics.Vector`1[short]>
; V02 arg1 [V02,T01] ( 3, 3 ) short -> rsi single-def
;# V03 OutArgs [V03 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0
G_M5366_IG01:
;; size=0 bbWeight=1 PerfScore 0.00
G_M5366_IG02:
; Splat the scalar (arg1 in rsi) across all 16-bit lanes of ymm0.
- movsx rax, si
- vmovd xmm0, eax
+ vmovd xmm0, esi
vpbroadcastw ymm0, ymm0
; Multiply lane-by-lane (16-bit, low half of each product) with arg0 at [rsp+0x08].
vpmullw ymm0, ymm0, ymmword ptr [rsp+0x08]
; Store to the RetBuf (rdi) and return the RetBuf address in rax.
vmovups ymmword ptr [rdi], ymm0
mov rax, rdi
- ;; size=26 bbWeight=1 PerfScore 12.50
+ ;; size=22 bbWeight=1 PerfScore 12.25
G_M5366_IG03:
vzeroupper
ret
;; size=4 bbWeight=1 PerfScore 2.00
-; Total bytes of code 30, prolog size 0, PerfScore 14.50, instruction count 8, allocated bytes for code 30 (MethodHash=9611eb09) for method System.Numerics.Vector`1[short]:System.Runtime.Intrinsics.ISimdVector<System.Numerics.Vector<T>,T>.Multiply(System.Numerics.Vector`1[short],short):System.Numerics.Vector`1[short] (FullOpts)
+; Total bytes of code 26, prolog size 0, PerfScore 14.25, instruction count 7, allocated bytes for code 26 (MethodHash=9611eb09) for method System.Numerics.Vector`1[short]:System.Runtime.Intrinsics.ISimdVector<System.Numerics.Vector<T>,T>.Multiply(System.Numerics.Vector`1[short],short):System.Numerics.Vector`1[short] (FullOpts)
; ============================================================