Skip to content

Instantly share code, notes, and snippets.

@MihuBot
Created October 31, 2024 15:23
Show Gist options
  • Save MihuBot/cac97d05b7a0f61201b054be0f3e61df to your computer and use it in GitHub Desktop.
Save MihuBot/cac97d05b7a0f61201b054be0f3e61df to your computer and use it in GitHub Desktop.

Top method improvements

-30 (-7.18 % of base) - System.PackedSpanHelpers:IndexOf[System.SpanHelpers+DontNegate`1[short],System.PackedSpanHelpers+NopTransform](byref,short,int):int
 ; Assembly listing for method System.PackedSpanHelpers:IndexOf[System.SpanHelpers+DontNegate`1[short],System.PackedSpanHelpers+NopTransform](byref,short,int):int (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 19 single block inlinees; 5 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T02] ( 15, 12   )   byref  ->  rdi         single-def
 ;  V01 arg1         [V01,T06] (  6,  3.75)   short  ->  rsi         single-def
 ;  V02 arg2         [V02,T01] ( 15, 19.50)     int  ->  rdx        
 ;  V03 loc0         [V03,T03] (  6, 13.50)    long  ->  rcx        
 ;  V04 loc1         [V04,T00] ( 10, 22.50)   byref  ->  rax        
 ;* V05 loc2         [V05    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V06 loc3         [V06    ] (  0,  0   )   byref  ->  zero-ref   
 ;* V07 loc4         [V07    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[short]>
 ;* V08 loc5         [V08    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V09 loc6         [V09    ] (  0,  0   )   byref  ->  zero-ref   
 ;* V10 loc7         [V10    ] (  0,  0   )   byref  ->  zero-ref   
 ;* V11 loc8         [V11    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[short]>
 ;* V12 loc9         [V12    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;  V13 loc10        [V13,T17] (  3,  5   )  simd32  ->  mm0         <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V14 loc11        [V14,T07] (  2,  4.50)   byref  ->  rcx         single-def
 ;* V15 loc12        [V15    ] (  0,  0   )  simd32  ->  zero-ref    <System.Runtime.Intrinsics.Vector256`1[short]>
 ;* V16 loc13        [V16    ] (  0,  0   )  simd32  ->  zero-ref    <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V17 loc14        [V17,T16] (  4, 12.50)  simd32  ->  mm1         <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V18 loc15        [V18,T10] (  5,  2.50)   byref  ->  rcx         single-def
 ;* V19 loc16        [V19    ] (  0,  0   )   byref  ->  zero-ref    single-def
 ;* V20 loc17        [V20    ] (  0,  0   )  simd32  ->  zero-ref    <System.Runtime.Intrinsics.Vector256`1[short]>
 ;* V21 loc18        [V21    ] (  0,  0   )  simd32  ->  zero-ref    <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V22 loc19        [V22,T18] (  4,  2   )  simd32  ->  mm0         <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V23 loc20        [V23,T20] (  2,  1   )  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V24 loc21        [V24    ] (  0,  0   )   byref  ->  zero-ref   
 ;* V25 loc22        [V25    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[short]>
 ;* V26 loc23        [V26    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V27 loc24        [V27    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;  V28 loc25        [V28,T11] (  5,  2.50)   byref  ->  rax         single-def
 ;* V29 loc26        [V29    ] (  0,  0   )   byref  ->  zero-ref    single-def
 ;* V30 loc27        [V30    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[short]>
 ;* V31 loc28        [V31    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;  V32 loc29        [V32,T19] (  4,  2   )  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;# V33 OutArgs      [V33    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;  V34 tmp1         [V34,T14] (  3,  1.50)   byref  ->  rcx        
 ;* V35 tmp2         [V35    ] (  0,  0   )  simd16  ->  zero-ref    "impAppendStmt"
 ;* V36 tmp3         [V36    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
 ;  V37 tmp4         [V37,T15] (  3,  1.50)   byref  ->  rax        
 ;* V38 tmp5         [V38    ] (  0,  0   )  simd32  ->  zero-ref    "impAppendStmt"
 ;* V39 tmp6         [V39    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
 ;* V40 tmp7         [V40    ] (  0,  0   )  simd32  ->  zero-ref    "impAppendStmt"
 ;* V41 tmp8         [V41    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
 ;* V42 tmp9         [V42    ] (  0,  0   )   short  ->  zero-ref    "Inlining Arg"
 ;* V43 tmp10        [V43    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
 ;* V44 tmp11        [V44    ] (  0,  0   )   short  ->  zero-ref    "Inlining Arg"
 ;* V45 tmp12        [V45    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
 ;* V46 tmp13        [V46    ] (  0,  0   )   short  ->  zero-ref    "Inlining Arg"
 ;* V47 tmp14        [V47    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
 ;* V48 tmp15        [V48    ] (  0,  0   )   short  ->  zero-ref    "Inlining Arg"
 ;* V49 tmp16        [V49    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
 ;* V50 tmp17        [V50    ] (  0,  0   )   short  ->  zero-ref    "Inlining Arg"
 ;* V51 tmp18        [V51    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
 ;* V52 tmp19        [V52    ] (  0,  0   )  simd32  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;* V53 tmp20        [V53    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
 ;* V54 tmp21        [V54    ] (  0,  0   )  simd32  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;* V55 tmp22        [V55    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
 ;  V56 tmp23        [V56,T12] (  5,  2.50)     int  ->  rdx         "Inline stloc first use temp"
 ;  V57 tmp24        [V57,T08] (  3,  3   )   byref  ->  rax         "Inlining Arg"
 ;* V58 tmp25        [V58    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;  V59 tmp26        [V59,T13] (  5,  2.50)     int  ->  rdx         "Inline stloc first use temp"
 ;  V60 tmp27        [V60,T09] (  3,  3   )   byref  ->  rcx         "Inlining Arg"
 ;  V61 tmp28        [V61,T04] (  7,  7   )     int  ->  rax         "Single return block return value"
 ;* V62 tmp29        [V62    ] (  0,  0   )  simd32  ->  zero-ref    "field V05._lower (fldOffset=0x0)" P-INDEP
 ;* V63 tmp30        [V63    ] (  0,  0   )  simd32  ->  zero-ref    "field V05._upper (fldOffset=0x20)" P-INDEP
 ;* V64 tmp31        [V64    ] (  0,  0   )  simd32  ->  zero-ref    "field V07._lower (fldOffset=0x0)" P-INDEP
 ;* V65 tmp32        [V65    ] (  0,  0   )  simd32  ->  zero-ref    "field V07._upper (fldOffset=0x20)" P-INDEP
 ;* V66 tmp33        [V66    ] (  0,  0   )  simd32  ->  zero-ref    "field V08._lower (fldOffset=0x0)" P-INDEP
 ;* V67 tmp34        [V67    ] (  0,  0   )  simd32  ->  zero-ref    "field V08._upper (fldOffset=0x20)" P-INDEP
 ;* V68 tmp35        [V68    ] (  0,  0   )  simd32  ->  zero-ref    "field V11._lower (fldOffset=0x0)" P-INDEP
 ;* V69 tmp36        [V69    ] (  0,  0   )  simd32  ->  zero-ref    "field V11._upper (fldOffset=0x20)" P-INDEP
 ;* V70 tmp37        [V70    ] (  0,  0   )  simd32  ->  zero-ref    "field V12._lower (fldOffset=0x0)" P-INDEP
 ;* V71 tmp38        [V71    ] (  0,  0   )  simd32  ->  zero-ref    "field V12._upper (fldOffset=0x20)" P-INDEP
 ;  V72 cse0         [V72,T05] (  7,  6.75)     int  ->  rax         hoist multi-def "CSE #01: aggressive"
 ;
 ; Lcl frame size = 0
 
 ; Control-flow summary of the instruction groups below
 ; (rdi = arg0 byref, esi = arg1 value, edx = arg2 length, result in eax):
 ;   length < 8         -> IG10..IG13, scalar compares
 ;   8 <= length <= 16  -> IG03/IG04, one 128-bit packed compare
 ;   length > 16        -> IG05..IG09, 256-bit packed compares
 ;   no match           -> IG14 returns -1
 ; Lines prefixed '-' belong to the base codegen only, '+' to the diff codegen only.
 G_M26041_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M26041_IG02:
 ; Lengths below 8 take the scalar path.
        cmp      edx, 8
        jl       G_M26041_IG10
 						;; size=9 bbWeight=1 PerfScore 1.25
 G_M26041_IG03:
        mov      rax, rdi
        cmp      edx, 16
        jg       SHORT G_M26041_IG05
 ; 128-bit path. The base codegen zero-extended sil first; the diff codegen omits it
 ; (vpbroadcastb below replicates only the low byte of xmm0).
-       movzx    rsi, sil
        vmovd    xmm0, esi
        vpbroadcastb xmm0, xmm0
        add      edx, -8
 ; rax = rdi + 2*(length - 8). The diff codegen uses rsi as the sign-extension temporary.
-       movsxd   rax, edx
-       lea      rax, bword ptr [rdi+2*rax]
+       movsxd   rsi, edx
+       lea      rax, bword ptr [rdi+2*rsi]
 ; rcx = the lower (unsigned) of rdi and rax. The 16 bytes at rcx and the 16 bytes at rax
 ; are packed to bytes (vpackuswb: words -> bytes with unsigned saturation) and compared
 ; against the broadcast value.
        cmp      rdi, rax
        mov      rcx, rdi
        cmova    rcx, rax
        vmovups  xmm1, xmmword ptr [rcx]
        vpackuswb xmm1, xmm1, xmmword ptr [rax]
        vpcmpeqb xmm0, xmm1, xmm0
        vptest   xmm0, xmm0
        je       G_M26041_IG14
 ; edx = index of the first matching byte lane; lanes 8..15 came from the vector at rax,
 ; so in that case the base pointer is switched to rax and the lane index reduced by 8.
        vpmovmskb edx, xmm0
        tzcnt    edx, edx
        cmp      edx, 8
        jl       SHORT G_M26041_IG04
        mov      rcx, rax
        add      edx, -8
-						;; size=83 bbWeight=0.50 PerfScore 11.88
+						;; size=79 bbWeight=0.50 PerfScore 11.75
 G_M26041_IG04:
 ; eax = (rcx - rdi) / 2 + edx: element offset of the chosen vector plus the lane index.
        sub      rcx, rdi
        shr      rcx, 1
        lea      eax, [rcx+rdx]
        jmp      G_M26041_IG17
-       align    [6 bytes for IG06]
-						;; size=20 bbWeight=0.50 PerfScore 1.62
+       align    [0 bytes for IG06]
+						;; size=14 bbWeight=0.50 PerfScore 1.62
 G_M26041_IG05:
 ; 256-bit path: broadcast the low byte of esi across ymm0 (base codegen again had a
 ; preceding movzx that the diff codegen omits).
-       movzx    rsi, sil
        vmovd    xmm0, esi
        vpbroadcastb ymm0, ymm0
        cmp      edx, 32
        jle      SHORT G_M26041_IG07
 ; rcx = rax + 2*(length - 32): upper bound for the IG06 loop.
-       lea      ecx, [rdx-0x20]
-       movsxd   rcx, ecx
+       lea      esi, [rdx-0x20]
+       movsxd   rcx, esi
        lea      rcx, bword ptr [rax+2*rcx]
-						;; size=28 bbWeight=0.50 PerfScore 3.38
+						;; size=24 bbWeight=0.50 PerfScore 3.25
 G_M26041_IG06:
 ; Loop body: pack the 32 bytes at rax and the 32 bytes at rax+0x20 to bytes, compare
 ; with ymm0, and leave for IG09 on any match; otherwise advance rax by 64 bytes while
 ; rax < rcx (unsigned).
        vmovups  ymm1, ymmword ptr [rax]
        vpackuswb ymm1, ymm1, ymmword ptr [rax+0x20]
        vpcmpeqb ymm1, ymm1, ymm0
        vptest   ymm1, ymm1
        jne      SHORT G_M26041_IG09
        add      rax, 64
        cmp      rax, rcx
        jb       SHORT G_M26041_IG06
 						;; size=29 bbWeight=4 PerfScore 64.00
 G_M26041_IG07:
 ; Final 256-bit compare: rcx = rdi + 2*(length - 16); rax is lowered to rcx when it is
 ; above it (cmova), then the vectors at rax and rcx are packed and compared.
        add      edx, -16
        movsxd   rcx, edx
        lea      rcx, bword ptr [rdi+2*rcx]
        cmp      rax, rcx
        cmova    rax, rcx
        vmovups  ymm1, ymmword ptr [rax]
        vpackuswb ymm1, ymm1, ymmword ptr [rcx]
        vpcmpeqb ymm0, ymm1, ymm0
        vptest   ymm0, ymm0
        je       G_M26041_IG14
 ; vpermq imm -40 (0xD8) orders the qwords as 0,2,1,3 before the byte mask is extracted;
 ; mask bits 16..31 then correspond to the vector at rcx.
        vpermq   ymm0, ymm0, -40
        vpmovmskb edx, ymm0
        tzcnt    edx, edx
        cmp      edx, 16
        jl       SHORT G_M26041_IG08
        mov      rax, rcx
        add      edx, -16
 						;; size=65 bbWeight=0.50 PerfScore 12.38
 G_M26041_IG08:
 ; eax = (rax - rdi) / 2 + edx. In the diff codegen the jump is SHORT and no alignment
 ; padding follows.
        sub      rax, rdi
        shr      rax, 1
        add      eax, edx
-       jmp      G_M26041_IG17
-       align    [13 bytes for IG13]
-						;; size=26 bbWeight=0.50 PerfScore 1.50
+       jmp      SHORT G_M26041_IG17
+       align    [0 bytes for IG13]
+						;; size=10 bbWeight=0.50 PerfScore 1.50
 G_M26041_IG09:
 ; Match found inside the IG06 loop: eax = (rax - rdi) / 2 + index of the first set mask
 ; bit, using the same vpermq reordering as IG07.
        sub      rax, rdi
        shr      rax, 1
        vpermq   ymm0, ymm1, -40
        vpmovmskb edi, ymm0
        xor      ecx, ecx
        tzcnt    ecx, edi
        add      eax, ecx
        jmp      SHORT G_M26041_IG17
 						;; size=26 bbWeight=0.50 PerfScore 5.12
 G_M26041_IG10:
 ; Scalar path (length < 8). With length >= 4, elements 0..3 are compared against the
 ; sign-extended si and return 0..3 through IG22/IG20/IG18/IG16; ecx becomes the
 ; starting element index (0 or 4) for the IG13 loop.
        xor      ecx, ecx
        cmp      edx, 4
        jl       SHORT G_M26041_IG11
        add      edx, -4
        movsx    rcx, word  ptr [rdi]
        movsx    rax, si
        cmp      ecx, eax
        je       SHORT G_M26041_IG22
        movsx    rcx, word  ptr [rdi+0x02]
        cmp      ecx, eax
        je       SHORT G_M26041_IG20
        movsx    rcx, word  ptr [rdi+0x04]
        cmp      ecx, eax
        je       SHORT G_M26041_IG18
        movsx    rcx, word  ptr [rdi+0x06]
        cmp      ecx, eax
        je       SHORT G_M26041_IG16
        mov      ecx, 4
 						;; size=54 bbWeight=0.50 PerfScore 11.62
 G_M26041_IG11:
 ; Nothing left to scan -> return -1.
        test     edx, edx
        jle      SHORT G_M26041_IG14
 						;; size=4 bbWeight=0.50 PerfScore 0.62
 G_M26041_IG12:
 ; (Re)load the sign-extended search value for the loop below.
        movsx    rax, si
 						;; size=4 bbWeight=0.25 PerfScore 0.06
 G_M26041_IG13:
 ; One element per iteration: edx counts down the remaining elements, rcx is the index.
        dec      edx
        movsx    rsi, word  ptr [rdi+2*rcx]
        cmp      esi, eax
        je       SHORT G_M26041_IG15
        inc      rcx
        test     edx, edx
        jg       SHORT G_M26041_IG13
 						;; size=18 bbWeight=4 PerfScore 28.00
 G_M26041_IG14:
 ; Not found.
        mov      eax, -1
        jmp      SHORT G_M26041_IG17
 						;; size=7 bbWeight=0.50 PerfScore 1.12
 G_M26041_IG15:
 ; Found in the IG13 loop: return the element index.
        mov      eax, ecx
        jmp      SHORT G_M26041_IG17
 						;; size=4 bbWeight=0.50 PerfScore 1.12
 ; IG16..IG23: return blocks for the constant indices 3, 2, 1 and 0; each constant is
 ; followed by its own vzeroupper / pop rbp / ret epilog (IG17 is also the shared exit
 ; for the jumps above).
 G_M26041_IG16:
        mov      eax, 3
 						;; size=5 bbWeight=0.50 PerfScore 0.12
 G_M26041_IG17:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M26041_IG18:
        mov      eax, 2
 						;; size=5 bbWeight=0.50 PerfScore 0.12
 G_M26041_IG19:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M26041_IG20:
        mov      eax, 1
 						;; size=5 bbWeight=0.50 PerfScore 0.12
 G_M26041_IG21:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M26041_IG22:
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M26041_IG23:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 
-; Total bytes of code 418, prolog size 4, PerfScore 150.44, instruction count 126, allocated bytes for code 418 (MethodHash=25ef9a46) for method System.PackedSpanHelpers:IndexOf[System.SpanHelpers+DontNegate`1[short],System.PackedSpanHelpers+NopTransform](byref,short,int):int (FullOpts)
+; Total bytes of code 388, prolog size 4, PerfScore 150.19, instruction count 124, allocated bytes for code 391 (MethodHash=25ef9a46) for method System.PackedSpanHelpers:IndexOf[System.SpanHelpers+DontNegate`1[short],System.PackedSpanHelpers+NopTransform](byref,short,int):int (FullOpts)
 ; ============================================================
-15 (-2.19 % of base) - System.PackedSpanHelpers:IndexOfAny[System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,int):int
 ; Assembly listing for method System.PackedSpanHelpers:IndexOfAny[System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,int):int (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 11 single block inlinees; 5 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T03] ( 15, 12   )   byref  ->  rdi         single-def
 ;  V01 arg1         [V01,T10] (  6,  3.75)   short  ->  rsi         single-def
 ;  V02 arg2         [V02,T05] (  9,  7   )   short  ->  rdx         single-def
 ;  V03 arg3         [V03,T06] (  9,  7   )   short  ->  rcx         single-def
 ;  V04 arg4         [V04,T01] ( 15, 19.50)     int  ->   r8        
 ;  V05 loc0         [V05,T04] (  6, 13.50)    long  ->   r9        
 ;  V06 loc1         [V06,T02] ( 20, 20   )   short  ->  rax        
 ;  V07 loc2         [V07,T00] ( 10, 22.50)   byref  ->  rax        
 ;* V08 loc3         [V08    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V09 loc4         [V09    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V10 loc5         [V10    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V11 loc6         [V11    ] (  0,  0   )   byref  ->  zero-ref   
 ;* V12 loc7         [V12    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[short]>
 ;* V13 loc8         [V13    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V14 loc9         [V14    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V15 loc10        [V15    ] (  0,  0   )   byref  ->  zero-ref   
 ;* V16 loc11        [V16    ] (  0,  0   )   byref  ->  zero-ref   
 ;* V17 loc12        [V17    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[short]>
 ;* V18 loc13        [V18    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V19 loc14        [V19    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;  V20 loc15        [V20,T26] (  3,  5   )  simd32  ->  mm0         <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V21 loc16        [V21,T27] (  3,  5   )  simd32  ->  mm1         <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V22 loc17        [V22,T28] (  3,  5   )  simd32  ->  mm2         <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V23 loc18        [V23,T11] (  2,  4.50)   byref  ->  rcx         single-def
 ;* V24 loc19        [V24    ] (  0,  0   )  simd32  ->  zero-ref    <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V25 loc20        [V25,T24] (  4, 16   )  simd32  ->  mm3         <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V26 loc21        [V26,T25] (  4, 12.50)  simd32  ->  mm3         <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V27 loc22        [V27,T14] (  5,  2.50)   byref  ->  rcx         single-def
 ;* V28 loc23        [V28    ] (  0,  0   )   byref  ->  zero-ref    single-def
 ;* V29 loc24        [V29    ] (  0,  0   )  simd32  ->  zero-ref    <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V30 loc25        [V30,T29] (  4,  2   )  simd32  ->  mm3         <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V31 loc26        [V31,T30] (  4,  2   )  simd32  ->  mm0         <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V32 loc27        [V32,T33] (  2,  1   )  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;  V33 loc28        [V33,T34] (  2,  1   )  simd16  ->  mm1         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;  V34 loc29        [V34,T35] (  2,  1   )  simd16  ->  mm2         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V35 loc30        [V35    ] (  0,  0   )   byref  ->  zero-ref   
 ;* V36 loc31        [V36    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[short]>
 ;* V37 loc32        [V37    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V38 loc33        [V38    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;  V39 loc34        [V39,T15] (  5,  2.50)   byref  ->  rax         single-def
 ;* V40 loc35        [V40    ] (  0,  0   )   byref  ->  zero-ref    single-def
 ;* V41 loc36        [V41    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[short]>
 ;  V42 loc37        [V42,T31] (  4,  2   )  simd16  ->  mm3         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;  V43 loc38        [V43,T32] (  4,  2   )  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;# V44 OutArgs      [V44    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;  V45 tmp1         [V45,T18] (  3,  1.50)   byref  ->  rcx        
+;  V45 tmp1         [V45,T18] (  3,  1.50)   byref  ->  rdx        
 ;* V46 tmp2         [V46    ] (  0,  0   )  simd16  ->  zero-ref    "impAppendStmt"
 ;  V47 tmp3         [V47,T19] (  3,  1.50)   byref  ->  rax        
 ;* V48 tmp4         [V48    ] (  0,  0   )  simd32  ->  zero-ref    "impAppendStmt"
 ;* V49 tmp5         [V49    ] (  0,  0   )  simd32  ->  zero-ref    "impAppendStmt"
 ;  V50 tmp6         [V50,T09] (  2,  4   )     int  ->  rax        
 ;  V51 tmp7         [V51,T20] (  2,  1   )     int  ->  rax        
 ;  V52 tmp8         [V52,T21] (  2,  1   )     int  ->  rax        
 ;  V53 tmp9         [V53,T22] (  2,  1   )     int  ->  rax        
 ;  V54 tmp10        [V54,T23] (  2,  1   )     int  ->  rax        
 ;* V55 tmp11        [V55    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
 ;* V56 tmp12        [V56    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
 ;* V57 tmp13        [V57    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
 ;* V58 tmp14        [V58    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
 ;* V59 tmp15        [V59    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
 ;* V60 tmp16        [V60    ] (  0,  0   )  simd32  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;* V61 tmp17        [V61    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
 ;* V62 tmp18        [V62    ] (  0,  0   )  simd32  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;* V63 tmp19        [V63    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
 ;  V64 tmp20        [V64,T16] (  5,  2.50)     int  ->  rdx         "Inline stloc first use temp"
 ;  V65 tmp21        [V65,T12] (  3,  3   )   byref  ->  rax         "Inlining Arg"
 ;* V66 tmp22        [V66    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;  V67 tmp23        [V67,T17] (  5,  2.50)     int  ->  rdx         "Inline stloc first use temp"
-;  V68 tmp24        [V68,T13] (  3,  3   )   byref  ->  rcx         "Inlining Arg"
+;  V67 tmp23        [V67,T17] (  5,  2.50)     int  ->  rcx         "Inline stloc first use temp"
+;  V68 tmp24        [V68,T13] (  3,  3   )   byref  ->  rdx         "Inlining Arg"
 ;  V69 tmp25        [V69,T07] (  7,  7   )     int  ->  rax         "Single return block return value"
 ;* V70 tmp26        [V70    ] (  0,  0   )  simd32  ->  zero-ref    "field V08._lower (fldOffset=0x0)" P-INDEP
 ;* V71 tmp27        [V71    ] (  0,  0   )  simd32  ->  zero-ref    "field V08._upper (fldOffset=0x20)" P-INDEP
 ;* V72 tmp28        [V72    ] (  0,  0   )  simd32  ->  zero-ref    "field V09._lower (fldOffset=0x0)" P-INDEP
 ;* V73 tmp29        [V73    ] (  0,  0   )  simd32  ->  zero-ref    "field V09._upper (fldOffset=0x20)" P-INDEP
 ;* V74 tmp30        [V74    ] (  0,  0   )  simd32  ->  zero-ref    "field V10._lower (fldOffset=0x0)" P-INDEP
 ;* V75 tmp31        [V75    ] (  0,  0   )  simd32  ->  zero-ref    "field V10._upper (fldOffset=0x20)" P-INDEP
 ;* V76 tmp32        [V76    ] (  0,  0   )  simd32  ->  zero-ref    "field V12._lower (fldOffset=0x0)" P-INDEP
 ;* V77 tmp33        [V77    ] (  0,  0   )  simd32  ->  zero-ref    "field V12._upper (fldOffset=0x20)" P-INDEP
 ;* V78 tmp34        [V78    ] (  0,  0   )  simd32  ->  zero-ref    "field V13._lower (fldOffset=0x0)" P-INDEP
 ;* V79 tmp35        [V79    ] (  0,  0   )  simd32  ->  zero-ref    "field V13._upper (fldOffset=0x20)" P-INDEP
 ;* V80 tmp36        [V80    ] (  0,  0   )  simd32  ->  zero-ref    "field V14._lower (fldOffset=0x0)" P-INDEP
 ;* V81 tmp37        [V81    ] (  0,  0   )  simd32  ->  zero-ref    "field V14._upper (fldOffset=0x20)" P-INDEP
 ;* V82 tmp38        [V82    ] (  0,  0   )  simd32  ->  zero-ref    "field V17._lower (fldOffset=0x0)" P-INDEP
 ;* V83 tmp39        [V83    ] (  0,  0   )  simd32  ->  zero-ref    "field V17._upper (fldOffset=0x20)" P-INDEP
 ;* V84 tmp40        [V84    ] (  0,  0   )  simd32  ->  zero-ref    "field V18._lower (fldOffset=0x0)" P-INDEP
 ;* V85 tmp41        [V85    ] (  0,  0   )  simd32  ->  zero-ref    "field V18._upper (fldOffset=0x20)" P-INDEP
 ;* V86 tmp42        [V86    ] (  0,  0   )  simd32  ->  zero-ref    "field V19._lower (fldOffset=0x0)" P-INDEP
 ;* V87 tmp43        [V87    ] (  0,  0   )  simd32  ->  zero-ref    "field V19._upper (fldOffset=0x20)" P-INDEP
 ;  V88 cse0         [V88,T08] (  7,  6.75)     int  ->  r10         hoist multi-def "CSE #01: moderate"
 ;
 ; Lcl frame size = 0
 
 ; Control-flow summary of the instruction groups below
 ; (rdi = arg0 byref, si/dx/cx = arg1..arg3 values, r8d = arg4 length, result in eax):
 ;   length < 8         -> IG10..IG15, scalar compares against all three values
 ;   8 <= length <= 16  -> IG03/IG04, one 128-bit packed compare
 ;   length > 16        -> IG05..IG09, 256-bit packed compares
 ;   no match           -> IG16 returns -1
 ; Lines prefixed '-' belong to the base codegen only, '+' to the diff codegen only.
 G_M33471_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M33471_IG02:
 ; Lengths below 8 take the scalar path.
        cmp      r8d, 8
        jl       G_M33471_IG10
 						;; size=10 bbWeight=1 PerfScore 1.25
 G_M33471_IG03:
        mov      rax, rdi
        cmp      r8d, 16
 ; 128-bit path. The base codegen zero-extended sil / dl / cl before each vmovd; the
 ; diff codegen omits all three (vpbroadcastb replicates only the low byte) and the
 ; branch to IG05 becomes SHORT.
-       jg       G_M33471_IG05
-       movzx    rsi, sil
+       jg       SHORT G_M33471_IG05
+		  ;; NOP compensation instructions of 4 bytes.
        vmovd    xmm0, esi
        vpbroadcastb xmm0, xmm0
-       movzx    rdx, dl
        vmovd    xmm1, edx
        vpbroadcastb xmm1, xmm1
-       movzx    rcx, cl
        vmovd    xmm2, ecx
        vpbroadcastb xmm2, xmm2
        add      r8d, -8
 ; rax = rdi + 2*(length - 8). The diff codegen uses rsi as the sign-extension
 ; temporary and keeps the first-vector pointer in rdx instead of rcx.
-       movsxd   rax, r8d
-       lea      rax, bword ptr [rdi+2*rax]
+       movsxd   rsi, r8d
+       lea      rax, bword ptr [rdi+2*rsi]
        cmp      rdi, rax
-       mov      rcx, rdi
-       cmova    rcx, rax
-       vmovups  xmm3, xmmword ptr [rcx]
+       mov      rdx, rdi
+       cmova    rdx, rax
+       vmovups  xmm3, xmmword ptr [rdx]
 ; Pack both 16-byte vectors to bytes, compare against each broadcast value, and OR the
 ; three comparison results together.
        vpackuswb xmm3, xmm3, xmmword ptr [rax]
        vpcmpeqb xmm0, xmm3, xmm0
        vpcmpeqb xmm1, xmm3, xmm1
        vpor     xmm0, xmm1, xmm0
        vpcmpeqb xmm1, xmm3, xmm2
        vpor     xmm0, xmm1, xmm0
        vptest   xmm0, xmm0
        je       G_M33471_IG16
 ; Index of the first matching byte lane (edx in base, ecx in diff); lanes 8..15 came
 ; from the vector at rax, so the base pointer is switched and the index reduced by 8.
-       vpmovmskb edx, xmm0
-       tzcnt    edx, edx
-       cmp      edx, 8
+       vpmovmskb ecx, xmm0
+       tzcnt    ecx, ecx
+       cmp      ecx, 8
        jl       SHORT G_M33471_IG04
-       mov      rcx, rax
-       add      edx, -8
-						;; size=129 bbWeight=0.50 PerfScore 15.96
+       mov      rdx, rax
+       add      ecx, -8
+						;; size=119 bbWeight=0.50 PerfScore 15.58
 G_M33471_IG04:
 ; eax = (vector pointer - rdi) / 2 + lane index; same computation in both versions,
 ; with the roles of rcx and rdx swapped.
-       sub      rcx, rdi
-       shr      rcx, 1
-       lea      eax, [rcx+rdx]
+       sub      rdx, rdi
+       shr      rdx, 1
+       lea      eax, [rdx+rcx]
        jmp      G_M33471_IG19
-       align    [0 bytes for IG06]
-						;; size=14 bbWeight=0.50 PerfScore 1.62
+       align    [1 bytes for IG06]
+						;; size=15 bbWeight=0.50 PerfScore 1.62
 G_M33471_IG05:
 ; 256-bit path: broadcast the low byte of each value across ymm0 / ymm1 / ymm2 (the
 ; diff codegen again omits the three movzx instructions).
-       movzx    rsi, sil
        vmovd    xmm0, esi
        vpbroadcastb ymm0, ymm0
-       movzx    rdx, dl
        vmovd    xmm1, edx
        vpbroadcastb ymm1, ymm1
-       movzx    rcx, cl
        vmovd    xmm2, ecx
        vpbroadcastb ymm2, ymm2
        cmp      r8d, 32
        jle      SHORT G_M33471_IG07
 ; rcx = rax + 2*(length - 32): upper bound for the IG06 loop.
-       lea      ecx, [r8-0x20]
-       movsxd   rcx, ecx
-       lea      rcx, bword ptr [rax+2*rcx]
-						;; size=54 bbWeight=0.50 PerfScore 7.62
+       lea      esi, [r8-0x20]
+       movsxd   rdx, esi
+       lea      rcx, bword ptr [rax+2*rdx]
+						;; size=44 bbWeight=0.50 PerfScore 7.25
 G_M33471_IG06:
 ; Loop body: pack the 32 bytes at rax and at rax+0x20 to bytes, compare against all
 ; three broadcast values, OR the results, and leave for IG09 on any match; otherwise
 ; advance rax by 64 bytes while rax < rcx (unsigned).
        vmovups  ymm3, ymmword ptr [rax]
        vpackuswb ymm3, ymm3, ymmword ptr [rax+0x20]
        vpcmpeqb ymm4, ymm3, ymm0
        vpcmpeqb ymm5, ymm3, ymm1
        vpor     ymm4, ymm5, ymm4
        vpcmpeqb ymm3, ymm3, ymm2
        vpor     ymm3, ymm3, ymm4
        vptest   ymm3, ymm3
        jne      SHORT G_M33471_IG09
        add      rax, 64
        cmp      rax, rcx
        jb       SHORT G_M33471_IG06
 						;; size=45 bbWeight=4 PerfScore 70.67
 G_M33471_IG07:
 ; Final 256-bit compare: rcx = rdi + 2*(length - 16); rax is lowered to rcx when it is
 ; above it (cmova), then the vectors at rax and rcx are packed and compared.
        add      r8d, -16
        movsxd   rcx, r8d
        lea      rcx, bword ptr [rdi+2*rcx]
        cmp      rax, rcx
        cmova    rax, rcx
        vmovups  ymm3, ymmword ptr [rax]
        vpackuswb ymm3, ymm3, ymmword ptr [rcx]
        vpcmpeqb ymm0, ymm0, ymm3
        vpcmpeqb ymm1, ymm1, ymm3
        vpor     ymm0, ymm1, ymm0
        vpcmpeqb ymm1, ymm2, ymm3
        vpor     ymm0, ymm1, ymm0
        vptest   ymm0, ymm0
        je       G_M33471_IG16
 ; vpermq imm -40 (0xD8) orders the qwords as 0,2,1,3 before the byte mask is extracted;
 ; mask bits 16..31 then correspond to the vector at rcx.
        vpermq   ymm0, ymm0, -40
        vpmovmskb edx, ymm0
        tzcnt    edx, edx
        cmp      edx, 16
        jl       SHORT G_M33471_IG08
        mov      rax, rcx
        add      edx, -16
 						;; size=82 bbWeight=0.50 PerfScore 13.21
 G_M33471_IG08:
 ; eax = (rax - rdi) / 2 + edx.
        sub      rax, rdi
        shr      rax, 1
        add      eax, edx
        jmp      G_M33471_IG19
-       align    [0 bytes for IG13]
-						;; size=13 bbWeight=0.50 PerfScore 1.50
+       align    [4 bytes for IG13]
+						;; size=17 bbWeight=0.50 PerfScore 1.50
 G_M33471_IG09:
 ; Match found inside the IG06 loop: eax = (rax - rdi) / 2 + index of the first set mask
 ; bit, using the same vpermq reordering as IG07.
        sub      rax, rdi
        shr      rax, 1
        vpermq   ymm0, ymm3, -40
        vpmovmskb edi, ymm0
        xor      ecx, ecx
        tzcnt    ecx, edi
        add      eax, ecx
        jmp      G_M33471_IG19
 						;; size=29 bbWeight=0.50 PerfScore 5.12
 G_M33471_IG10:
 ; Scalar path (length < 8). With length >= 4, each of elements 0..3 is compared against
 ; the sign-extended si, dx and cx in turn (the third compare is materialised through
 ; sete / movzx / test); a hit returns 0..3 through IG24/IG22/IG20/IG18. r9 becomes the
 ; starting element index (0 or 4) for the IG13 loop.
        xor      r9d, r9d
        cmp      r8d, 4
        jl       G_M33471_IG11
        add      r8d, -4
        movsx    rax, word  ptr [rdi]
        movsx    r10, si
        cmp      eax, r10d
        je       G_M33471_IG24
        movsx    r9, dx
        cmp      eax, r9d
        je       G_M33471_IG24
        movsx    r9, cx
        cmp      eax, r9d
        sete     al
        movzx    rax, al
        test     al, al
        jne      G_M33471_IG24
        movsx    rax, word  ptr [rdi+0x02]
        cmp      eax, r10d
        je       G_M33471_IG22
        movsx    r9, dx
        cmp      eax, r9d
        je       G_M33471_IG22
        movsx    r9, cx
        cmp      eax, r9d
        sete     al
        movzx    rax, al
        test     al, al
        jne      G_M33471_IG22
        movsx    rax, word  ptr [rdi+0x04]
        cmp      eax, r10d
        je       G_M33471_IG20
        movsx    r9, dx
        cmp      eax, r9d
        je       G_M33471_IG20
        movsx    r9, cx
        cmp      eax, r9d
        sete     al
        movzx    rax, al
        test     al, al
        jne      SHORT G_M33471_IG20
        movsx    rax, word  ptr [rdi+0x06]
        cmp      eax, r10d
        je       SHORT G_M33471_IG18
 ; r10 is overwritten with dx here; IG12 reloads it from si before the IG13 loop uses it.
        movsx    r10, dx
        cmp      eax, r10d
        je       SHORT G_M33471_IG18
        movsx    r9, cx
        cmp      eax, r9d
        sete     al
        movzx    rax, al
        test     al, al
        jne      SHORT G_M33471_IG18
        mov      r9d, 4
 						;; size=202 bbWeight=0.50 PerfScore 20.62
 G_M33471_IG11:
 ; Nothing left to scan -> return -1.
        test     r8d, r8d
        jle      SHORT G_M33471_IG16
 						;; size=5 bbWeight=0.50 PerfScore 0.62
 G_M33471_IG12:
 ; (Re)load the sign-extended first search value for the loop below.
        movsx    r10, si
 						;; size=4 bbWeight=0.25 PerfScore 0.06
 G_M33471_IG13:
 ; One element per iteration: r8d counts down the remaining elements, r9 is the index.
 ; The element is compared with the first value here and with dx / cx in IG14.
        dec      r8d
        movsx    rax, word  ptr [rdi+2*r9]
        cmp      eax, r10d
        je       SHORT G_M33471_IG17
 						;; size=13 bbWeight=4 PerfScore 22.00
 G_M33471_IG14:
        movsx    rsi, dx
        cmp      eax, esi
        je       SHORT G_M33471_IG17
        movsx    rsi, cx
        cmp      eax, esi
        sete     al
        movzx    rax, al
        test     al, al
        jne      SHORT G_M33471_IG17
 						;; size=24 bbWeight=2 PerfScore 9.00
 G_M33471_IG15:
        inc      r9
        test     r8d, r8d
        jg       SHORT G_M33471_IG13
 						;; size=8 bbWeight=4 PerfScore 6.00
 G_M33471_IG16:
 ; Not found.
        mov      eax, -1
        jmp      SHORT G_M33471_IG19
 						;; size=7 bbWeight=0.50 PerfScore 1.12
 G_M33471_IG17:
 ; Found in the IG13/IG14 loop: return the element index.
        mov      eax, r9d
        jmp      SHORT G_M33471_IG19
 						;; size=5 bbWeight=0.50 PerfScore 1.12
 ; IG18..IG25: return blocks for the constant indices 3, 2, 1 and 0; each constant is
 ; followed by its own vzeroupper / pop rbp / ret epilog (IG19 is also the shared exit
 ; for the jumps above).
 G_M33471_IG18:
        mov      eax, 3
 						;; size=5 bbWeight=0.50 PerfScore 0.12
 G_M33471_IG19:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M33471_IG20:
        mov      eax, 2
 						;; size=5 bbWeight=0.50 PerfScore 0.12
 G_M33471_IG21:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M33471_IG22:
        mov      eax, 1
 						;; size=5 bbWeight=0.50 PerfScore 0.12
 G_M33471_IG23:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M33471_IG24:
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M33471_IG25:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 
-; Total bytes of code 685, prolog size 4, PerfScore 184.27, instruction count 195, allocated bytes for code 685 (MethodHash=2ec77d40) for method System.PackedSpanHelpers:IndexOfAny[System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,int):int (FullOpts)
+; Total bytes of code 670, prolog size 4, PerfScore 183.52, instruction count 189, allocated bytes for code 670 (MethodHash=2ec77d40) for method System.PackedSpanHelpers:IndexOfAny[System.SpanHelpers+DontNegate`1[short]](byref,short,short,short,int):int (FullOpts)
 ; ============================================================
-12 (-1.85 % of base) - System.String:MakeSeparatorListVectorized(System.ReadOnlySpan`1[ushort],byref,ushort,ushort,ushort)
 ; Assembly listing for method System.String:MakeSeparatorListVectorized(System.ReadOnlySpan`1[ushort],byref,ushort,ushort,ushort) (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 5 single block inlinees; 3 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;* V00 arg0         [V00    ] (  0,  0   )  struct (16) zero-ref    multireg-arg ld-addr-op single-def <System.ReadOnlySpan`1[ushort]>
 ;  V01 arg1         [V01,T00] ( 17,140   )   byref  ->  rbx         single-def
 ;  V02 arg2         [V02,T20] (  4,  3   )  ushort  ->  rcx         single-def
 ;  V03 arg3         [V03,T17] (  5,  5   )  ushort  ->  r15         single-def
 ;  V04 arg4         [V04,T18] (  5,  5   )  ushort  ->  r14         single-def
 ;  V05 loc0         [V05,T12] (  5, 18   )    long  ->  r13        
 ;  V06 loc1         [V06,T01] ( 16, 87   )    long  ->  r12        
 ;  V07 loc2         [V07,T13] (  4, 13   )   byref  ->  [rbp-0xE0]  spill-single-def
 ;* V08 loc3         [V08    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;* V09 loc4         [V09    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;* V10 loc5         [V10    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;* V11 loc6         [V11    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;* V12 loc7         [V12    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;* V13 loc8         [V13    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ushort]>
 ;* V14 loc9         [V14    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V15 loc10        [V15    ] (  0,  0   )    long  ->  zero-ref   
 ;* V16 loc11        [V16    ] (  0,  0   )     int  ->  zero-ref   
 ;  V17 loc12        [V17,T29] (  2,  4.50)  simd32  ->  [rbp-0x50]  spill-single-def <System.Runtime.Intrinsics.Vector256`1[ushort]>
 ;  V18 loc13        [V18,T30] (  2,  4.50)  simd32  ->  [rbp-0x70]  spill-single-def <System.Runtime.Intrinsics.Vector256`1[ushort]>
 ;  V19 loc14        [V19,T31] (  2,  4.50)  simd32  ->  [rbp-0x90]  spill-single-def <System.Runtime.Intrinsics.Vector256`1[ushort]>
 ;* V20 loc15        [V20    ] (  0,  0   )  simd32  ->  zero-ref    <System.Runtime.Intrinsics.Vector256`1[ushort]>
 ;* V21 loc16        [V21    ] (  0,  0   )  simd32  ->  zero-ref    <System.Runtime.Intrinsics.Vector256`1[ushort]>
 ;* V22 loc17        [V22    ] (  0,  0   )  simd32  ->  zero-ref    <System.Runtime.Intrinsics.Vector256`1[ushort]>
 ;  V23 loc18        [V23,T27] (  4, 14   )  simd32  ->  mm3         <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V24 loc19        [V24,T02] (  5, 66   )     int  ->  [rbp-0x94] 
 ;* V25 loc20        [V25    ] (  0,  0   )     int  ->  zero-ref   
 ;  V26 loc21        [V26,T32] (  2,  4.50)  simd16  ->  [rbp-0xB0]  spill-single-def <System.Runtime.Intrinsics.Vector128`1[ushort]>
 ;  V27 loc22        [V27,T33] (  2,  4.50)  simd16  ->  [rbp-0xC0]  spill-single-def <System.Runtime.Intrinsics.Vector128`1[ushort]>
 ;  V28 loc23        [V28,T34] (  2,  4.50)  simd16  ->  [rbp-0xD0]  spill-single-def <System.Runtime.Intrinsics.Vector128`1[ushort]>
 ;* V29 loc24        [V29    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ushort]>
 ;* V30 loc25        [V30    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ushort]>
 ;* V31 loc26        [V31    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ushort]>
 ;  V32 loc27        [V32,T28] (  4, 14   )  simd16  ->  mm3         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;  V33 loc28        [V33,T03] (  5, 66   )     int  ->  [rbp-0xD4] 
 ;* V34 loc29        [V34    ] (  0,  0   )     int  ->  zero-ref   
 ;  V35 loc30        [V35,T14] (  4, 12   )  ushort  ->  rdi        
 ;# V36 OutArgs      [V36    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;  V37 tmp1         [V37,T25] (  4, 32   )  simd16  ->  mm3         "dup spill"
 ;  V38 tmp2         [V38,T26] (  4, 32   )  simd32  ->  mm3         "dup spill"
 ;* V39 tmp3         [V39    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "Inlining Arg" <System.ReadOnlySpan`1[ushort]>
 ;  V40 tmp4         [V40,T06] (  4, 48   )     int  ->   r8         "Inline stloc first use temp"
 ;* V41 tmp5         [V41    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "Inline stloc first use temp" <System.Span`1[int]>
 ;  V42 tmp6         [V42,T04] (  3, 64   )     int  ->  rsi         "Inlining Arg"
 ;  V43 tmp7         [V43,T07] (  4, 48   )     int  ->   r8         "Inline stloc first use temp"
 ;* V44 tmp8         [V44    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "Inline stloc first use temp" <System.Span`1[int]>
 ;  V45 tmp9         [V45,T05] (  3, 64   )     int  ->  rsi         "Inlining Arg"
 ;  V46 tmp10        [V46,T16] (  4,  8   )     int  ->  rcx         "Inline stloc first use temp"
 ;* V47 tmp11        [V47    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "Inline stloc first use temp" <System.Span`1[int]>
 ;  V48 tmp12        [V48,T15] (  3, 12   )     int  ->  rsi         "Inlining Arg"
 ;  V49 tmp13        [V49,T21] (  2,  2   )   byref  ->  rdi         single-def "field V00._reference (fldOffset=0x0)" P-INDEP
 ;  V50 tmp14        [V50,T23] (  2,  2   )     int  ->  rsi         single-def "field V00._length (fldOffset=0x8)" P-INDEP
 ;* V51 tmp15        [V51    ] (  0,  0   )  simd32  ->  zero-ref    "field V08._lower (fldOffset=0x0)" P-INDEP
 ;* V52 tmp16        [V52    ] (  0,  0   )  simd32  ->  zero-ref    "field V08._upper (fldOffset=0x20)" P-INDEP
 ;* V53 tmp17        [V53    ] (  0,  0   )  simd32  ->  zero-ref    "field V09._lower (fldOffset=0x0)" P-INDEP
 ;* V54 tmp18        [V54    ] (  0,  0   )  simd32  ->  zero-ref    "field V09._upper (fldOffset=0x20)" P-INDEP
 ;* V55 tmp19        [V55    ] (  0,  0   )  simd32  ->  zero-ref    "field V10._lower (fldOffset=0x0)" P-INDEP
 ;* V56 tmp20        [V56    ] (  0,  0   )  simd32  ->  zero-ref    "field V10._upper (fldOffset=0x20)" P-INDEP
 ;* V57 tmp21        [V57    ] (  0,  0   )  simd32  ->  zero-ref    "field V11._lower (fldOffset=0x0)" P-INDEP
 ;* V58 tmp22        [V58    ] (  0,  0   )  simd32  ->  zero-ref    "field V11._upper (fldOffset=0x20)" P-INDEP
 ;* V59 tmp23        [V59    ] (  0,  0   )  simd32  ->  zero-ref    "field V12._lower (fldOffset=0x0)" P-INDEP
 ;* V60 tmp24        [V60    ] (  0,  0   )  simd32  ->  zero-ref    "field V12._upper (fldOffset=0x20)" P-INDEP
 ;* V61 tmp25        [V61    ] (  0,  0   )  simd32  ->  zero-ref    "field V13._lower (fldOffset=0x0)" P-INDEP
 ;* V62 tmp26        [V62    ] (  0,  0   )  simd32  ->  zero-ref    "field V13._upper (fldOffset=0x20)" P-INDEP
 ;* V63 tmp27        [V63    ] (  0,  0   )  simd32  ->  zero-ref    "field V14._lower (fldOffset=0x0)" P-INDEP
 ;* V64 tmp28        [V64    ] (  0,  0   )  simd32  ->  zero-ref    "field V14._upper (fldOffset=0x20)" P-INDEP
 ;* V65 tmp29        [V65    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V39._reference (fldOffset=0x0)" P-INDEP
 ;* V66 tmp30        [V66    ] (  0,  0   )     int  ->  zero-ref    "field V39._length (fldOffset=0x8)" P-INDEP
 ;  V67 tmp31        [V67,T10] (  2, 24   )   byref  ->   r9         "field V41._reference (fldOffset=0x0)" P-INDEP
 ;  V68 tmp32        [V68,T08] (  2, 32   )     int  ->  rdi         "field V41._length (fldOffset=0x8)" P-INDEP
 ;  V69 tmp33        [V69,T11] (  2, 24   )   byref  ->   r9         "field V44._reference (fldOffset=0x0)" P-INDEP
 ;  V70 tmp34        [V70,T09] (  2, 32   )     int  ->  rdi         "field V44._length (fldOffset=0x8)" P-INDEP
 ;  V71 tmp35        [V71,T22] (  2,  4   )   byref  ->   r8         "field V47._reference (fldOffset=0x0)" P-INDEP
 ;  V72 tmp36        [V72,T24] (  2,  4   )     int  ->  rdi         "field V47._length (fldOffset=0x8)" P-INDEP
 ;  V73 cse0         [V73,T19] (  5,  6   )     int  ->  [rbp-0xD8]  multi-def "CSE #01: moderate"
 ;
 ; Lcl frame size = 184
 
 G_M10293_IG01:
        ; Prologue. Saves rbp plus the callee-saved registers this method uses
        ; (r15, r14, r13, r12, rbx), reserves the 184-byte local frame, and sets
        ; rbp = rsp+0xE0 so that locals are addressed as [rbp-0x..].
        push     rbp
        push     r15
        push     r14
        push     r13
        push     r12
        push     rbx
        sub      rsp, 184
        lea      rbp, [rsp+0xE0]
        ; Move incoming arguments into their home registers (see the locals table):
        ; rdx -> rbx (arg1, byref), r8d -> r15d (arg3), r9d -> r14d (arg4).
        ; rbx/r14/r15 are callee-saved under the SysV ABI, so these values remain
        ; valid across the AddWithResize calls later in the method.
        mov      rbx, rdx
        mov      r15d, r8d
        mov      r14d, r9d
 						;; size=34 bbWeight=1 PerfScore 7.50
 G_M10293_IG02:
        ; Loop state setup and vector-width dispatch.
        ; r13 = esi zero-extended (span length, V50 -> loc0), r12 = 0 (loc1, the
        ; running element index), and the span reference from rdi is kept in rax
        ; and also spilled to [rbp-0xE0] (loc2) so it can be reloaded later.
        mov      r13d, esi
        xor      r12d, r12d
        mov      rax, rdi
        mov      bword ptr [rbp-0xE0], rax
        ; Unsigned compare: length >= 32 takes the 256-bit (ymm) path at IG12;
        ; anything smaller falls through to the 128-bit (xmm) path at IG03.
        ; NOTE(review): no lower bound on the length is checked here before the
        ; 128-bit path -- presumably the caller guarantees a minimum; confirm.
        cmp      r13, 32
        jae      G_M10293_IG12
 						;; size=26 bbWeight=1 PerfScore 3.00
 G_M10293_IG03:
        ; 128-bit path setup: build one vector per separator with the 16-bit value
        ; replicated into every word lane (xmm0 <- arg2, xmm1 <- arg3, xmm2 <- arg4)
        ; and spill each vector to the frame so it can be reloaded at the bottom
        ; of the loop (IG11).
        ; arg2 is zero-extended into edx and also saved to [rbp-0xD8] (cse0); the
        ; scalar tail loop reloads it from there.
        movzx    rdx, cx
        mov      dword ptr [rbp-0xD8], edx
        vmovd    xmm0, edx
        vpbroadcastw xmm0, xmm0
        vmovaps  xmmword ptr [rbp-0xB0], xmm0
        ; Diff hunk: the base code zero-extended r15w into rdi before vmovd; the
        ; new code feeds r15d to vmovd directly. vpbroadcastw only consumes the
        ; low word of its source, so the upper bits of r15d do not reach the result.
-       movzx    rdi, r15w
-       vmovd    xmm1, edi
+       vmovd    xmm1, r15d
        vpbroadcastw xmm1, xmm1
        vmovaps  xmmword ptr [rbp-0xC0], xmm1
        ; Diff hunk: same simplification for arg4 in r14.
-       movzx    rdi, r14w
-       vmovd    xmm2, edi
+       vmovd    xmm2, r14d
        vpbroadcastw xmm2, xmm2
        vmovaps  xmmword ptr [rbp-0xD0], xmm2
        ; Enter the loop at IG05, skipping IG04's reload: rax still holds the base.
        jmp      SHORT G_M10293_IG05
-						;; size=70 bbWeight=0.50 PerfScore 7.88
+						;; size=64 bbWeight=0.50 PerfScore 7.62
 G_M10293_IG04:
        ; 128-bit scan loop, back-edge entry: reload the span base from its spill
        ; slot. rax is a volatile register under SysV, so it cannot be assumed to
        ; survive the AddWithResize call in IG08.
        mov      rax, bword ptr [rbp-0xE0]
 						;; size=7 bbWeight=2 PerfScore 2.00
 G_M10293_IG05:
        ; Load 8 UTF-16 code units at index r12 (unaligned load), compare them
        ; against each of the three broadcast separators and OR the results:
        ; xmm3 has all-ones in every word lane that matched any separator.
        vmovups  xmm3, xmmword ptr [rax+2*r12]
        vpcmpeqw xmm4, xmm3, xmm0
        vpcmpeqw xmm5, xmm3, xmm1
        vpor     xmm4, xmm5, xmm4
        vpcmpeqw xmm3, xmm3, xmm2
        vpor     xmm3, xmm3, xmm4
        ; No lane matched -> skip straight to the index advance in IG11.
        vptest   xmm3, xmm3
        je       SHORT G_M10293_IG11
 						;; size=33 bbWeight=4 PerfScore 40.67
 G_M10293_IG06:
        ; vpmovmskb yields one bit per byte; 0x5555 keeps only the even bits so
        ; each matching 16-bit lane contributes exactly one set bit.
        vpmovmskb ecx, xmm3
        and      ecx, 0x5555
 						;; size=10 bbWeight=2 PerfScore 4.50
 G_M10293_IG07:
        ; Per-match loop. The current mask is spilled to [rbp-0xD4] because ecx is
        ; volatile and would not survive the call in IG08; IG10 reads it back.
        mov      dword ptr [rbp-0xD4], ecx
        ; NOTE(review): the xor result is overwritten immediately by tzcnt; it
        ; looks like a dependency-breaking zeroing of tzcnt's destination.
        xor      edi, edi
        ; edi = bit position of the lowest match; >>1 converts the byte position
        ; into an element offset; esi = offset + r12d = index within the span.
        tzcnt    edi, ecx
        shr      edi, 1
        mov      esi, edi
        add      esi, r12d
        ; Read the list builder through rbx (arg1): [rbx+0x08] is used as the
        ; current position, [rbx+0x10] as the Span<int> reference and [rbx+0x18]
        ; as its length (V40/V67/V68 in the locals table).
        mov      r8d, dword ptr [rbx+0x08]
        mov      r9, bword ptr [rbx+0x10]
        mov      edi, dword ptr [rbx+0x18]
        ; Unsigned position < length -> room left, store inline at IG09.
        cmp      r8d, edi
        jb       SHORT G_M10293_IG09
 						;; size=35 bbWeight=16 PerfScore 184.00
 G_M10293_IG08:
        ; Slow path: no room in the span. Call AddWithResize with rdi = builder
        ; (this) and esi = the index computed above.
        mov      rdi, rbx
        mov      r8, 0xD1FFAB1E      ; code for System.Collections.Generic.ValueListBuilder`1[int]:AddWithResize(int):this
        call     [r8]System.Collections.Generic.ValueListBuilder`1[int]:AddWithResize(int):this
        jmp      SHORT G_M10293_IG10
 						;; size=18 bbWeight=8 PerfScore 44.00
 G_M10293_IG09:
        ; Fast path: span[pos] = index; then write pos + 1 back to [rbx+0x08].
        mov      edi, r8d
        mov      dword ptr [r9+4*rdi], esi
        inc      r8d
        mov      dword ptr [rbx+0x08], r8d
 						;; size=14 bbWeight=8 PerfScore 20.00
 G_M10293_IG10:
        ; Clear the lowest set bit of the spilled mask; blsr sets ZF when the
        ; result is zero, so the loop repeats while matches remain in this vector.
        blsr     ecx, dword ptr [rbp-0xD4]
        jne      SHORT G_M10293_IG07
 						;; size=11 bbWeight=16 PerfScore 48.00
 G_M10293_IG11:
        ; Advance by 8 elements and test (unsigned) index <= length - 8.
        add      r12, 8
        lea      rdi, [r13-0x08]
        cmp      r12, rdi
        ; Reload the three broadcast vectors from their spill slots (xmm registers
        ; are volatile across the call). vmovaps leaves the flags untouched, so
        ; the jbe below still acts on the cmp above.
        vmovaps  xmm0, xmmword ptr [rbp-0xB0]
        vmovaps  xmm1, xmmword ptr [rbp-0xC0]
        vmovaps  xmm2, xmmword ptr [rbp-0xD0]
        jbe      G_M10293_IG04
        ; Fewer than 8 elements left: finish with the scalar loop at IG22.
        jmp      G_M10293_IG22
 						;; size=46 bbWeight=4 PerfScore 52.00
 G_M10293_IG12:
        ; 256-bit path setup (reached from IG02 when length >= 32): same as the
        ; 128-bit setup but each separator is replicated across all word lanes of
        ; a ymm register (ymm0 <- arg2, ymm1 <- arg3, ymm2 <- arg4) and spilled to
        ; [rbp-0x50] / [rbp-0x70] / [rbp-0x90] for reloading in IG20.
        ; arg2 is zero-extended into edx and saved to [rbp-0xD8] (cse0) for the
        ; scalar tail loop.
        movzx    rdx, cx
        mov      dword ptr [rbp-0xD8], edx
        vmovd    xmm0, edx
        vpbroadcastw ymm0, ymm0
        vmovups  ymmword ptr [rbp-0x50], ymm0
        ; Diff hunk: the base code zero-extended r15w into rdi before vmovd; the
        ; new code feeds r15d to vmovd directly. vpbroadcastw only consumes the
        ; low word of its source, so the upper bits of r15d do not reach the result.
-       movzx    rdi, r15w
-       vmovd    xmm1, edi
+       vmovd    xmm1, r15d
        vpbroadcastw ymm1, ymm1
        vmovups  ymmword ptr [rbp-0x70], ymm1
        ; Diff hunk: same simplification for arg4 in r14.
-       movzx    rdi, r14w
-       vmovd    xmm2, edi
+       vmovd    xmm2, r14d
        vpbroadcastw ymm2, ymm2
        vmovups  ymmword ptr [rbp-0x90], ymm2
        ; Enter the loop at IG14, skipping IG13's reload: rax still holds the base.
        jmp      SHORT G_M10293_IG14
-						;; size=64 bbWeight=0.50 PerfScore 9.38
+						;; size=58 bbWeight=0.50 PerfScore 9.12
 G_M10293_IG13:
        ; 256-bit scan loop, back-edge entry: reload the span base from its spill
        ; slot. rax is a volatile register under SysV, so it cannot be assumed to
        ; survive the AddWithResize call in IG17.
        mov      rax, bword ptr [rbp-0xE0]
 						;; size=7 bbWeight=2 PerfScore 2.00
 G_M10293_IG14:
        ; Load 16 UTF-16 code units at index r12 (unaligned load), compare them
        ; against each of the three broadcast separators and OR the results:
        ; ymm3 has all-ones in every word lane that matched any separator.
        vmovups  ymm3, ymmword ptr [rax+2*r12]
        vpcmpeqw ymm4, ymm3, ymm0
        vpcmpeqw ymm5, ymm3, ymm1
        vpor     ymm4, ymm5, ymm4
        vpcmpeqw ymm3, ymm3, ymm2
        vpor     ymm3, ymm3, ymm4
        ; No lane matched -> skip straight to the index advance in IG20.
        vptest   ymm3, ymm3
        je       SHORT G_M10293_IG20
 						;; size=33 bbWeight=4 PerfScore 52.67
 G_M10293_IG15:
        ; vpmovmskb yields one bit per byte (32 bits for a ymm source).
        ; NOTE(review): 0xD1FFAB1E is the same token this listing prints for the
        ; AddWithResize address, i.e. it appears to be a placeholder rather than
        ; the real immediate. By analogy with the 0x5555 mask in the 128-bit path
        ; the actual value is presumably 0x55555555 -- confirm against a
        ; non-diffable disassembly.
        vpmovmskb ecx, ymm3
        and      ecx, 0xD1FFAB1E
 						;; size=10 bbWeight=2 PerfScore 6.50
 G_M10293_IG16:
        ; Per-match loop. The current mask is spilled to [rbp-0x94] because ecx is
        ; volatile and would not survive the call in IG17; IG19 reads it back.
        mov      dword ptr [rbp-0x94], ecx
        ; NOTE(review): the xor result is overwritten immediately by tzcnt; it
        ; looks like a dependency-breaking zeroing of tzcnt's destination.
        xor      edi, edi
        ; edi = bit position of the lowest match; >>1 converts the byte position
        ; into an element offset; esi = offset + r12d = index within the span.
        tzcnt    edi, ecx
        shr      edi, 1
        mov      esi, edi
        add      esi, r12d
        ; Read the list builder through rbx (arg1): [rbx+0x08] is used as the
        ; current position, [rbx+0x10] as the Span<int> reference and [rbx+0x18]
        ; as its length (V43/V69/V70 in the locals table).
        mov      r8d, dword ptr [rbx+0x08]
        mov      r9, bword ptr [rbx+0x10]
        mov      edi, dword ptr [rbx+0x18]
        ; Unsigned position < length -> room left, store inline at IG18.
        cmp      r8d, edi
        jb       SHORT G_M10293_IG18
 						;; size=35 bbWeight=16 PerfScore 184.00
 G_M10293_IG17:
        ; Slow path: no room in the span. Call AddWithResize with rdi = builder
        ; (this) and esi = the index computed above.
        mov      rdi, rbx
        mov      r8, 0xD1FFAB1E      ; code for System.Collections.Generic.ValueListBuilder`1[int]:AddWithResize(int):this
        call     [r8]System.Collections.Generic.ValueListBuilder`1[int]:AddWithResize(int):this
        jmp      SHORT G_M10293_IG19
 						;; size=18 bbWeight=8 PerfScore 44.00
 G_M10293_IG18:
        ; Fast path: span[pos] = index; then write pos + 1 back to [rbx+0x08].
        mov      edi, r8d
        mov      dword ptr [r9+4*rdi], esi
        inc      r8d
        mov      dword ptr [rbx+0x08], r8d
 						;; size=14 bbWeight=8 PerfScore 20.00
 G_M10293_IG19:
        ; Clear the lowest set bit of the spilled mask; blsr sets ZF when the
        ; result is zero, so the loop repeats while matches remain in this vector.
        blsr     ecx, dword ptr [rbp-0x94]
        jne      SHORT G_M10293_IG16
 						;; size=11 bbWeight=16 PerfScore 48.00
 G_M10293_IG20:
        ; Advance by 16 elements and test (unsigned) index <= length - 16.
        add      r12, 16
        lea      rdi, [r13-0x10]
        cmp      r12, rdi
        ; Reload the three broadcast vectors from their spill slots (ymm registers
        ; are volatile across the call). vmovups leaves the flags untouched, so
        ; the jbe below still acts on the cmp above.
        vmovups  ymm0, ymmword ptr [rbp-0x50]
        vmovups  ymm1, ymmword ptr [rbp-0x70]
        vmovups  ymm2, ymmword ptr [rbp-0x90]
        jbe      G_M10293_IG13
        ; Fewer than 16 elements left: finish with the scalar loop at IG22.
        jmp      SHORT G_M10293_IG22
 						;; size=37 bbWeight=4 PerfScore 64.00
 G_M10293_IG21:
        ; Scalar tail loop, increment step. The vector loops enter at IG22, so the
        ; first remaining element is examined before any increment.
        inc      r12
 						;; size=3 bbWeight=4 PerfScore 1.00
 G_M10293_IG22:
        ; Loop condition (unsigned): stop once index r12 reaches length r13.
        cmp      r12, r13
        jae      SHORT G_M10293_IG27
 						;; size=5 bbWeight=8 PerfScore 10.00
 G_M10293_IG23:
        ; Reload the span base, fetch the 16-bit element at index r12
        ; zero-extended (loc30 in rdi), and compare it with the zero-extended
        ; first separator saved at [rbp-0xD8] by the vector setup block.
        mov      rax, bword ptr [rbp-0xE0]
        movzx    rdi, word  ptr [rax+2*r12]
        mov      edx, dword ptr [rbp-0xD8]
        cmp      edi, edx
        je       SHORT G_M10293_IG25
 						;; size=22 bbWeight=4 PerfScore 21.00
 G_M10293_IG24:
        ; Not the first separator: compare against the second (r15w) and third
        ; (r14w), each zero-extended to 32 bits first. No match -> next element.
        movzx    rsi, r15w
        cmp      edi, esi
        je       SHORT G_M10293_IG25
        movzx    rsi, r14w
        cmp      edi, esi
        jne      SHORT G_M10293_IG21
 						;; size=16 bbWeight=2 PerfScore 6.00
 G_M10293_IG25:
        ; Match: append the index (esi = r12d) to the list builder at rbx.
        ; [rbx+0x08] is used as the position, [rbx+0x10] as the Span<int>
        ; reference and [rbx+0x18] as its length (V46/V71/V72 in the locals table).
        mov      esi, r12d
        mov      ecx, dword ptr [rbx+0x08]
        mov      r8, bword ptr [rbx+0x10]
        mov      edi, dword ptr [rbx+0x18]
        ; Unsigned position < length -> inline store at IG26; otherwise call
        ; AddWithResize with rdi = builder (this) and esi = index.
        cmp      ecx, edi
        jb       SHORT G_M10293_IG26
        mov      rdi, rbx
        mov      rcx, 0xD1FFAB1E      ; code for System.Collections.Generic.ValueListBuilder`1[int]:AddWithResize(int):this
        call     [rcx]System.Collections.Generic.ValueListBuilder`1[int]:AddWithResize(int):this
        jmp      SHORT G_M10293_IG21
 						;; size=34 bbWeight=2 PerfScore 26.00
 G_M10293_IG26:
        ; Fast path: span[pos] = index; then write pos + 1 back to [rbx+0x08].
        mov      edi, ecx
        mov      dword ptr [r8+4*rdi], esi
        inc      ecx
        mov      dword ptr [rbx+0x08], ecx
        jmp      SHORT G_M10293_IG21
 						;; size=13 bbWeight=2 PerfScore 9.00
 G_M10293_IG27:
        ; Epilogue. vzeroupper clears the upper halves of the ymm registers before
        ; returning (the 256-bit path may have used them); then the 184-byte frame
        ; is released and the registers pushed in IG01 are restored in reverse
        ; order. The method returns no value in rax (void signature).
        vzeroupper 
        add      rsp, 184
        pop      rbx
        pop      r12
        pop      r13
        pop      r14
        pop      r15
        pop      rbp
        ret      
 						;; size=21 bbWeight=1 PerfScore 5.25
 
-; Total bytes of code 647, prolog size 34, PerfScore 922.33, instruction count 163, allocated bytes for code 647 (MethodHash=8ebed7ca) for method System.String:MakeSeparatorListVectorized(System.ReadOnlySpan`1[ushort],byref,ushort,ushort,ushort) (FullOpts)
+; Total bytes of code 635, prolog size 34, PerfScore 921.83, instruction count 159, allocated bytes for code 635 (MethodHash=8ebed7ca) for method System.String:MakeSeparatorListVectorized(System.ReadOnlySpan`1[ushort],byref,ushort,ushort,ushort) (FullOpts)
 ; ============================================================
-10 (-1.78 % of base) - System.PackedSpanHelpers:IndexOfAny[System.SpanHelpers+DontNegate`1[short],System.PackedSpanHelpers+NopTransform](byref,short,short,int):int
 ; Assembly listing for method System.PackedSpanHelpers:IndexOfAny[System.SpanHelpers+DontNegate`1[short],System.PackedSpanHelpers+NopTransform](byref,short,short,int):int (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 19 single block inlinees; 5 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T03] ( 15, 12   )   byref  ->  rdi         single-def
 ;  V01 arg1         [V01,T09] (  6,  3.75)   short  ->  rsi         single-def
 ;  V02 arg2         [V02,T05] (  9,  7   )   short  ->  rdx         single-def
 ;  V03 arg3         [V03,T01] ( 15, 19.50)     int  ->  rcx        
 ;  V04 loc0         [V04,T04] (  6, 13.50)    long  ->   r8        
 ;  V05 loc1         [V05,T02] ( 15, 16   )   short  ->  rax        
 ;  V06 loc2         [V06,T00] ( 10, 22.50)   byref  ->  rax        
 ;* V07 loc3         [V07    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V08 loc4         [V08    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V09 loc5         [V09    ] (  0,  0   )   byref  ->  zero-ref   
 ;* V10 loc6         [V10    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[short]>
 ;* V11 loc7         [V11    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V12 loc8         [V12    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V13 loc9         [V13    ] (  0,  0   )   byref  ->  zero-ref   
 ;* V14 loc10        [V14    ] (  0,  0   )   byref  ->  zero-ref   
 ;* V15 loc11        [V15    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[short]>
 ;* V16 loc12        [V16    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V17 loc13        [V17    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;  V18 loc14        [V18,T25] (  3,  5   )  simd32  ->  mm0         <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V19 loc15        [V19,T26] (  3,  5   )  simd32  ->  mm1         <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V20 loc16        [V20,T10] (  2,  4.50)   byref  ->  rdx         single-def
 ;* V21 loc17        [V21    ] (  0,  0   )  simd32  ->  zero-ref    <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V22 loc18        [V22,T24] (  3, 12   )  simd32  ->  mm2         <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V23 loc19        [V23,T23] (  4, 12.50)  simd32  ->  mm2         <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V24 loc20        [V24,T13] (  5,  2.50)   byref  ->  rcx         single-def
 ;* V25 loc21        [V25    ] (  0,  0   )   byref  ->  zero-ref    single-def
 ;* V26 loc22        [V26    ] (  0,  0   )  simd32  ->  zero-ref    <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V27 loc23        [V27,T29] (  3,  1.50)  simd32  ->  mm2         <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V28 loc24        [V28,T27] (  4,  2   )  simd32  ->  mm0         <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V29 loc25        [V29,T31] (  2,  1   )  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;  V30 loc26        [V30,T32] (  2,  1   )  simd16  ->  mm1         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V31 loc27        [V31    ] (  0,  0   )   byref  ->  zero-ref   
 ;* V32 loc28        [V32    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[short]>
 ;* V33 loc29        [V33    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V34 loc30        [V34    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;  V35 loc31        [V35,T14] (  5,  2.50)   byref  ->  rax         single-def
 ;* V36 loc32        [V36    ] (  0,  0   )   byref  ->  zero-ref    single-def
 ;* V37 loc33        [V37    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[short]>
 ;  V38 loc34        [V38,T30] (  3,  1.50)  simd16  ->  mm2         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;  V39 loc35        [V39,T28] (  4,  2   )  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;# V40 OutArgs      [V40    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;  V41 tmp1         [V41,T17] (  3,  1.50)   byref  ->  rcx        
+;  V41 tmp1         [V41,T17] (  3,  1.50)   byref  ->  rdx        
 ;* V42 tmp2         [V42    ] (  0,  0   )  simd16  ->  zero-ref    "impAppendStmt"
 ;* V43 tmp3         [V43    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
 ;  V44 tmp4         [V44,T18] (  3,  1.50)   byref  ->  rax        
 ;* V45 tmp5         [V45    ] (  0,  0   )  simd32  ->  zero-ref    "impAppendStmt"
 ;* V46 tmp6         [V46    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
 ;* V47 tmp7         [V47    ] (  0,  0   )  simd32  ->  zero-ref    "impAppendStmt"
 ;* V48 tmp8         [V48    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
 ;  V49 tmp9         [V49,T08] (  2,  4   )     int  ->  rax        
 ;  V50 tmp10        [V50,T19] (  2,  1   )     int  ->  rax        
 ;  V51 tmp11        [V51,T20] (  2,  1   )     int  ->  rax        
 ;  V52 tmp12        [V52,T21] (  2,  1   )     int  ->  rax        
 ;  V53 tmp13        [V53,T22] (  2,  1   )     int  ->  rax        
 ;* V54 tmp14        [V54    ] (  0,  0   )   short  ->  zero-ref    "Inlining Arg"
 ;* V55 tmp15        [V55    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
 ;* V56 tmp16        [V56    ] (  0,  0   )   short  ->  zero-ref    "Inlining Arg"
 ;* V57 tmp17        [V57    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
 ;* V58 tmp18        [V58    ] (  0,  0   )   short  ->  zero-ref    "Inlining Arg"
 ;* V59 tmp19        [V59    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
 ;* V60 tmp20        [V60    ] (  0,  0   )   short  ->  zero-ref    "Inlining Arg"
 ;* V61 tmp21        [V61    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
 ;* V62 tmp22        [V62    ] (  0,  0   )   short  ->  zero-ref    "Inlining Arg"
 ;* V63 tmp23        [V63    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
 ;* V64 tmp24        [V64    ] (  0,  0   )  simd32  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;* V65 tmp25        [V65    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
 ;* V66 tmp26        [V66    ] (  0,  0   )  simd32  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;* V67 tmp27        [V67    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
 ;  V68 tmp28        [V68,T15] (  5,  2.50)     int  ->  rdx         "Inline stloc first use temp"
 ;  V69 tmp29        [V69,T11] (  3,  3   )   byref  ->  rax         "Inlining Arg"
 ;* V70 tmp30        [V70    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;  V71 tmp31        [V71,T16] (  5,  2.50)     int  ->  rdx         "Inline stloc first use temp"
-;  V72 tmp32        [V72,T12] (  3,  3   )   byref  ->  rcx         "Inlining Arg"
+;  V71 tmp31        [V71,T16] (  5,  2.50)     int  ->  rcx         "Inline stloc first use temp"
+;  V72 tmp32        [V72,T12] (  3,  3   )   byref  ->  rdx         "Inlining Arg"
 ;  V73 tmp33        [V73,T06] (  7,  7   )     int  ->  rax         "Single return block return value"
 ;* V74 tmp34        [V74    ] (  0,  0   )  simd32  ->  zero-ref    "field V07._lower (fldOffset=0x0)" P-INDEP
 ;* V75 tmp35        [V75    ] (  0,  0   )  simd32  ->  zero-ref    "field V07._upper (fldOffset=0x20)" P-INDEP
 ;* V76 tmp36        [V76    ] (  0,  0   )  simd32  ->  zero-ref    "field V08._lower (fldOffset=0x0)" P-INDEP
 ;* V77 tmp37        [V77    ] (  0,  0   )  simd32  ->  zero-ref    "field V08._upper (fldOffset=0x20)" P-INDEP
 ;* V78 tmp38        [V78    ] (  0,  0   )  simd32  ->  zero-ref    "field V10._lower (fldOffset=0x0)" P-INDEP
 ;* V79 tmp39        [V79    ] (  0,  0   )  simd32  ->  zero-ref    "field V10._upper (fldOffset=0x20)" P-INDEP
 ;* V80 tmp40        [V80    ] (  0,  0   )  simd32  ->  zero-ref    "field V11._lower (fldOffset=0x0)" P-INDEP
 ;* V81 tmp41        [V81    ] (  0,  0   )  simd32  ->  zero-ref    "field V11._upper (fldOffset=0x20)" P-INDEP
 ;* V82 tmp42        [V82    ] (  0,  0   )  simd32  ->  zero-ref    "field V12._lower (fldOffset=0x0)" P-INDEP
 ;* V83 tmp43        [V83    ] (  0,  0   )  simd32  ->  zero-ref    "field V12._upper (fldOffset=0x20)" P-INDEP
 ;* V84 tmp44        [V84    ] (  0,  0   )  simd32  ->  zero-ref    "field V15._lower (fldOffset=0x0)" P-INDEP
 ;* V85 tmp45        [V85    ] (  0,  0   )  simd32  ->  zero-ref    "field V15._upper (fldOffset=0x20)" P-INDEP
 ;* V86 tmp46        [V86    ] (  0,  0   )  simd32  ->  zero-ref    "field V16._lower (fldOffset=0x0)" P-INDEP
 ;* V87 tmp47        [V87    ] (  0,  0   )  simd32  ->  zero-ref    "field V16._upper (fldOffset=0x20)" P-INDEP
 ;* V88 tmp48        [V88    ] (  0,  0   )  simd32  ->  zero-ref    "field V17._lower (fldOffset=0x0)" P-INDEP
 ;* V89 tmp49        [V89    ] (  0,  0   )  simd32  ->  zero-ref    "field V17._upper (fldOffset=0x20)" P-INDEP
 ;  V90 cse0         [V90,T07] (  7,  6.75)     int  ->   r9         hoist multi-def "CSE #01: moderate"
 ;
 ; Lcl frame size = 0
 
 G_M59761_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M59761_IG02:
        cmp      ecx, 8
        jl       G_M59761_IG10
 						;; size=9 bbWeight=1 PerfScore 1.25
 G_M59761_IG03:
        mov      rax, rdi
        cmp      ecx, 16
        jg       SHORT G_M59761_IG05
-       movzx    rsi, sil
        vmovd    xmm0, esi
        vpbroadcastb xmm0, xmm0
-       movzx    rdx, dl
        vmovd    xmm1, edx
        vpbroadcastb xmm1, xmm1
        add      ecx, -8
-       movsxd   rax, ecx
-       lea      rax, bword ptr [rdi+2*rax]
+       movsxd   rsi, ecx
+       lea      rax, bword ptr [rdi+2*rsi]
        cmp      rdi, rax
-       mov      rcx, rdi
-       cmova    rcx, rax
-       vmovups  xmm2, xmmword ptr [rcx]
+       mov      rdx, rdi
+       cmova    rdx, rax
+       vmovups  xmm2, xmmword ptr [rdx]
        vpackuswb xmm2, xmm2, xmmword ptr [rax]
        vpcmpeqb xmm0, xmm2, xmm0
        vpcmpeqb xmm1, xmm2, xmm1
        vpor     xmm0, xmm1, xmm0
        vptest   xmm0, xmm0
        je       G_M59761_IG16
-       vpmovmskb edx, xmm0
-       tzcnt    edx, edx
-       cmp      edx, 8
+       vpmovmskb ecx, xmm0
+       tzcnt    ecx, ecx
+       cmp      ecx, 8
        jl       SHORT G_M59761_IG04
-       mov      rcx, rax
-       add      edx, -8
-						;; size=103 bbWeight=0.50 PerfScore 13.92
+       mov      rdx, rax
+       add      ecx, -8
+						;; size=96 bbWeight=0.50 PerfScore 13.67
 G_M59761_IG04:
-       sub      rcx, rdi
-       shr      rcx, 1
-       lea      eax, [rcx+rdx]
+       sub      rdx, rdi
+       shr      rdx, 1
+       lea      eax, [rdx+rcx]
        jmp      G_M59761_IG19
-       align    [0 bytes for IG06]
-						;; size=14 bbWeight=0.50 PerfScore 1.62
+       align    [4 bytes for IG06]
+						;; size=18 bbWeight=0.50 PerfScore 1.62
 G_M59761_IG05:
-       movzx    rsi, sil
        vmovd    xmm0, esi
        vpbroadcastb ymm0, ymm0
-       movzx    rdx, dl
        vmovd    xmm1, edx
        vpbroadcastb ymm1, ymm1
        cmp      ecx, 32
        jle      SHORT G_M59761_IG07
-       lea      edx, [rcx-0x20]
-       movsxd   rdx, edx
+       lea      esi, [rcx-0x20]
+       movsxd   rdx, esi
        lea      rdx, bword ptr [rax+2*rdx]
-						;; size=40 bbWeight=0.50 PerfScore 5.50
+						;; size=33 bbWeight=0.50 PerfScore 5.25
 G_M59761_IG06:
        vmovups  ymm2, ymmword ptr [rax]
        vpackuswb ymm2, ymm2, ymmword ptr [rax+0x20]
        vpcmpeqb ymm3, ymm2, ymm0
        vpcmpeqb ymm2, ymm2, ymm1
        vpor     ymm2, ymm2, ymm3
        vptest   ymm2, ymm2
        jne      SHORT G_M59761_IG09
        add      rax, 64
        cmp      rax, rdx
        jb       SHORT G_M59761_IG06
 						;; size=37 bbWeight=4 PerfScore 67.33
 G_M59761_IG07:
        add      ecx, -16
        movsxd   rcx, ecx
        lea      rcx, bword ptr [rdi+2*rcx]
        cmp      rax, rcx
        cmova    rax, rcx
        vmovups  ymm2, ymmword ptr [rax]
        vpackuswb ymm2, ymm2, ymmword ptr [rcx]
        vpcmpeqb ymm0, ymm0, ymm2
        vpcmpeqb ymm1, ymm1, ymm2
        vpor     ymm0, ymm1, ymm0
        vptest   ymm0, ymm0
        je       G_M59761_IG16
        vpermq   ymm0, ymm0, -40
        vpmovmskb edx, ymm0
        tzcnt    edx, edx
        cmp      edx, 16
        jl       SHORT G_M59761_IG08
        mov      rax, rcx
        add      edx, -16
 						;; size=73 bbWeight=0.50 PerfScore 12.79
 G_M59761_IG08:
        sub      rax, rdi
        shr      rax, 1
        add      eax, edx
        jmp      G_M59761_IG19
        align    [0 bytes for IG13]
 						;; size=13 bbWeight=0.50 PerfScore 1.50
 G_M59761_IG09:
        sub      rax, rdi
        shr      rax, 1
        vpermq   ymm0, ymm2, -40
        vpmovmskb edi, ymm0
        xor      ecx, ecx
        tzcnt    ecx, edi
        add      eax, ecx
        jmp      G_M59761_IG19
 						;; size=29 bbWeight=0.50 PerfScore 5.12
 G_M59761_IG10:
        xor      r8d, r8d
        cmp      ecx, 4
        jl       G_M59761_IG11
        add      ecx, -4
        movsx    rax, word  ptr [rdi]
        movsx    r9, si
        cmp      eax, r9d
        je       G_M59761_IG24
        movsx    r8, dx
        cmp      eax, r8d
        sete     al
        movzx    rax, al
        test     al, al
        jne      G_M59761_IG24
        movsx    rax, word  ptr [rdi+0x02]
        cmp      eax, r9d
        je       G_M59761_IG22
        movsx    r8, dx
        cmp      eax, r8d
        sete     al
        movzx    rax, al
        test     al, al
        jne      G_M59761_IG22
        movsx    rax, word  ptr [rdi+0x04]
        cmp      eax, r9d
        je       SHORT G_M59761_IG20
        movsx    r8, dx
        cmp      eax, r8d
        sete     al
        movzx    rax, al
        test     al, al
        jne      SHORT G_M59761_IG20
        movsx    rax, word  ptr [rdi+0x06]
        cmp      eax, r9d
        je       SHORT G_M59761_IG18
        movsx    r9, dx
        cmp      eax, r9d
        sete     al
        movzx    rax, al
        test     al, al
        jne      SHORT G_M59761_IG18
        mov      r8d, 4
 						;; size=148 bbWeight=0.50 PerfScore 17.62
 G_M59761_IG11:
        test     ecx, ecx
        jle      SHORT G_M59761_IG16
 						;; size=4 bbWeight=0.50 PerfScore 0.62
 G_M59761_IG12:
        movsx    r9, si
 						;; size=4 bbWeight=0.25 PerfScore 0.06
 G_M59761_IG13:
        dec      ecx
        movsx    rax, word  ptr [rdi+2*r8]
        cmp      eax, r9d
        je       SHORT G_M59761_IG17
 						;; size=12 bbWeight=4 PerfScore 22.00
 G_M59761_IG14:
        movsx    rsi, dx
        cmp      eax, esi
        sete     al
        movzx    rax, al
        test     al, al
        jne      SHORT G_M59761_IG17
 						;; size=16 bbWeight=2 PerfScore 6.00
 G_M59761_IG15:
        inc      r8
        test     ecx, ecx
        jg       SHORT G_M59761_IG13
 						;; size=7 bbWeight=4 PerfScore 6.00
 G_M59761_IG16:
        mov      eax, -1
        jmp      SHORT G_M59761_IG19
 						;; size=7 bbWeight=0.50 PerfScore 1.12
 G_M59761_IG17:
        mov      eax, r8d
        jmp      SHORT G_M59761_IG19
 						;; size=5 bbWeight=0.50 PerfScore 1.12
 G_M59761_IG18:
        mov      eax, 3
 						;; size=5 bbWeight=0.50 PerfScore 0.12
 G_M59761_IG19:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M59761_IG20:
        mov      eax, 2
 						;; size=5 bbWeight=0.50 PerfScore 0.12
 G_M59761_IG21:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M59761_IG22:
        mov      eax, 1
 						;; size=5 bbWeight=0.50 PerfScore 0.12
 G_M59761_IG23:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M59761_IG24:
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M59761_IG25:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 
-; Total bytes of code 562, prolog size 4, PerfScore 170.35, instruction count 168, allocated bytes for code 566 (MethodHash=49ec168e) for method System.PackedSpanHelpers:IndexOfAny[System.SpanHelpers+DontNegate`1[short],System.PackedSpanHelpers+NopTransform](byref,short,short,int):int (FullOpts)
+; Total bytes of code 552, prolog size 4, PerfScore 169.85, instruction count 164, allocated bytes for code 552 (MethodHash=49ec168e) for method System.PackedSpanHelpers:IndexOfAny[System.SpanHelpers+DontNegate`1[short],System.PackedSpanHelpers+NopTransform](byref,short,short,int):int (FullOpts)
 ; ============================================================
-8 (-16.33 % of base) - System.Numerics.Vector:CreateSequence[short](short,short):System.Numerics.Vector`1[short]
 ; Assembly listing for method System.Numerics.Vector:CreateSequence[short](short,short):System.Numerics.Vector`1[short] (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rsp based frame
 ; partially interruptible
 ; No PGO data
 ; Final local variable assignments
 ;
 ;  V00 RetBuf       [V00,T00] (  4,  4   )   byref  ->  rdi         single-def
 ;  V01 arg0         [V01,T01] (  3,  3   )   short  ->  rsi         single-def
 ;  V02 arg1         [V02,T02] (  3,  3   )   short  ->  rdx         single-def
 ;# V03 OutArgs      [V03    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;
 ; Lcl frame size = 0
 
 G_M54365_IG01:
 						;; size=0 bbWeight=1 PerfScore 0.00
 G_M54365_IG02:
-       movsx    rax, dx
-       vmovd    xmm0, eax
+       vmovd    xmm0, edx
        vpbroadcastw ymm0, ymm0
        vpmullw  ymm0, ymm0, ymmword ptr [reloc @RWD00]
-       movsx    rax, si
-       vmovd    xmm1, eax
+       vmovd    xmm1, esi
        vpbroadcastw ymm1, ymm1
        vpaddw   ymm0, ymm1, ymm0
        vmovups  ymmword ptr [rdi], ymm0
        mov      rax, rdi
-						;; size=45 bbWeight=1 PerfScore 17.08
+						;; size=37 bbWeight=1 PerfScore 16.58
 G_M54365_IG03:
        vzeroupper 
        ret      
 						;; size=4 bbWeight=1 PerfScore 2.00
 RWD00  	dq	0003000200010000h, 0007000600050004h, 000B000A00090008h, 000F000E000D000Ch
 
 
-; Total bytes of code 49, prolog size 0, PerfScore 19.08, instruction count 12, allocated bytes for code 49 (MethodHash=28852ba2) for method System.Numerics.Vector:CreateSequence[short](short,short):System.Numerics.Vector`1[short] (FullOpts)
+; Total bytes of code 41, prolog size 0, PerfScore 18.58, instruction count 10, allocated bytes for code 41 (MethodHash=28852ba2) for method System.Numerics.Vector:CreateSequence[short](short,short):System.Numerics.Vector`1[short] (FullOpts)
 ; ============================================================
-8 (-17.39 % of base) - System.Runtime.Intrinsics.Vector128:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector128`1[short]
 ; Assembly listing for method System.Runtime.Intrinsics.Vector128:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector128`1[short] (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rsp based frame
 ; partially interruptible
 ; No PGO data
 ; Final local variable assignments
 ;
 ;  V00 RetBuf       [V00,T00] (  4,  4   )   byref  ->  rdi         single-def
 ;  V01 arg0         [V01,T01] (  3,  3   )   short  ->  rsi         single-def
 ;  V02 arg1         [V02,T02] (  3,  3   )   short  ->  rdx         single-def
 ;# V03 OutArgs      [V03    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;
 ; Lcl frame size = 0
 
 G_M32125_IG01:
 						;; size=0 bbWeight=1 PerfScore 0.00
 G_M32125_IG02:
-       movsx    rax, dx
-       vmovd    xmm0, eax
+       vmovd    xmm0, edx
        vpbroadcastw xmm0, xmm0
        vpmullw  xmm0, xmm0, xmmword ptr [reloc @RWD00]
-       movsx    rax, si
-       vmovd    xmm1, eax
+       vmovd    xmm1, esi
        vpbroadcastw xmm1, xmm1
        vpaddw   xmm0, xmm1, xmm0
        vmovups  xmmword ptr [rdi], xmm0
        mov      rax, rdi
-						;; size=45 bbWeight=1 PerfScore 15.08
+						;; size=37 bbWeight=1 PerfScore 14.58
 G_M32125_IG03:
        ret      
 						;; size=1 bbWeight=1 PerfScore 1.00
 RWD00  	dq	0003000200010000h, 0007000600050004h
 
 
-; Total bytes of code 46, prolog size 0, PerfScore 16.08, instruction count 11, allocated bytes for code 46 (MethodHash=a9108282) for method System.Runtime.Intrinsics.Vector128:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector128`1[short] (FullOpts)
+; Total bytes of code 38, prolog size 0, PerfScore 15.58, instruction count 9, allocated bytes for code 38 (MethodHash=a9108282) for method System.Runtime.Intrinsics.Vector128:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector128`1[short] (FullOpts)
 ; ============================================================
-8 (-16.33 % of base) - System.Runtime.Intrinsics.Vector256:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector256`1[short]
 ; Assembly listing for method System.Runtime.Intrinsics.Vector256:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector256`1[short] (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rsp based frame
 ; partially interruptible
 ; No PGO data
 ; Final local variable assignments
 ;
 ;  V00 RetBuf       [V00,T00] (  4,  4   )   byref  ->  rdi         single-def
 ;  V01 arg0         [V01,T01] (  3,  3   )   short  ->  rsi         single-def
 ;  V02 arg1         [V02,T02] (  3,  3   )   short  ->  rdx         single-def
 ;# V03 OutArgs      [V03    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;
 ; Lcl frame size = 0
 
 G_M9853_IG01:
 						;; size=0 bbWeight=1 PerfScore 0.00
 G_M9853_IG02:
-       movsx    rax, dx
-       vmovd    xmm0, eax
+       vmovd    xmm0, edx
        vpbroadcastw ymm0, ymm0
        vpmullw  ymm0, ymm0, ymmword ptr [reloc @RWD00]
-       movsx    rax, si
-       vmovd    xmm1, eax
+       vmovd    xmm1, esi
        vpbroadcastw ymm1, ymm1
        vpaddw   ymm0, ymm1, ymm0
        vmovups  ymmword ptr [rdi], ymm0
        mov      rax, rdi
-						;; size=45 bbWeight=1 PerfScore 17.08
+						;; size=37 bbWeight=1 PerfScore 16.58
 G_M9853_IG03:
        vzeroupper 
        ret      
 						;; size=4 bbWeight=1 PerfScore 2.00
 RWD00  	dq	0003000200010000h, 0007000600050004h, 000B000A00090008h, 000F000E000D000Ch
 
 
-; Total bytes of code 49, prolog size 0, PerfScore 19.08, instruction count 12, allocated bytes for code 49 (MethodHash=587dd982) for method System.Runtime.Intrinsics.Vector256:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector256`1[short] (FullOpts)
+; Total bytes of code 41, prolog size 0, PerfScore 18.58, instruction count 10, allocated bytes for code 41 (MethodHash=587dd982) for method System.Runtime.Intrinsics.Vector256:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector256`1[short] (FullOpts)
 ; ============================================================
-8 (-7.69 % of base) - System.Runtime.Intrinsics.Vector512:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector512`1[short]
 ; Assembly listing for method System.Runtime.Intrinsics.Vector512:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector512`1[short] (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 14 single block inlinees; 7 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 RetBuf       [V00,T01] (  5,  5   )   byref  ->  rdi         single-def
 ;  V01 arg0         [V01,T02] (  3,  3   )   short  ->  rsi         single-def
 ;  V02 arg1         [V02,T03] (  3,  3   )   short  ->  rdx         single-def
 ;# V03 OutArgs      [V03    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;* V04 tmp1         [V04    ] (  0,  0   )  struct (64) zero-ref    "spilled call-like call argument" <System.Runtime.Intrinsics.Vector512`1[short]>
 ;* V05 tmp2         [V05    ] (  0,  0   )  struct (64) zero-ref    "impAppendStmt" <System.Runtime.Intrinsics.Vector512`1[short]>
 ;* V06 tmp3         [V06    ] (  0,  0   )  struct (64) zero-ref    "spilled call-like call argument" <System.Runtime.Intrinsics.Vector512`1[short]>
 ;  V07 tmp4         [V07    ] (  3,  6   )  struct (64) [rbp-0x40]  do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <System.Runtime.Intrinsics.Vector512`1[short]>
 ;  V08 tmp5         [V08,T00] (  6, 20.25)     int  ->  rax         "Inline stloc first use temp"
 ;* V09 tmp6         [V09    ] (  0,  0   )   short  ->  zero-ref    "Inline return value spill temp"
 ;* V10 tmp7         [V10    ] (  0,  0   )  struct (64) zero-ref    ld-addr-op "Inline ldloca(s) first use temp" <System.Runtime.Intrinsics.Vector512`1[short]>
 ;  V11 tmp8         [V11,T05] (  2,  4   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V12 tmp9         [V12,T06] (  2,  4   )  simd32  ->  mm1         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V13 tmp10        [V13,T04] (  3,  6   )  simd32  ->  mm2         "dup spill"
 ;* V14 tmp11        [V14    ] (  0,  0   )  struct (64) zero-ref    ld-addr-op "Inline ldloca(s) first use temp" <System.Runtime.Intrinsics.Vector512`1[short]>
 ;* V15 tmp12        [V15    ] (  0,  0   )  struct (64) zero-ref    ld-addr-op "Inline ldloca(s) first use temp" <System.Runtime.Intrinsics.Vector512`1[short]>
 ;  V16 tmp13        [V16,T07] (  2,  4   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V17 tmp14        [V17,T08] (  2,  4   )  simd32  ->  mm1         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V18 tmp15        [V18,T10] (  2,  2   )  simd32  ->  mm0         "field V04._lower (fldOffset=0x0)" P-INDEP
 ;  V19 tmp16        [V19,T11] (  2,  2   )  simd32  ->  mm1         "field V04._upper (fldOffset=0x20)" P-INDEP
 ;* V20 tmp17        [V20    ] (  0,  0   )  simd32  ->  zero-ref    "field V05._lower (fldOffset=0x0)" P-INDEP
 ;* V21 tmp18        [V21    ] (  0,  0   )  simd32  ->  zero-ref    "field V05._upper (fldOffset=0x20)" P-INDEP
 ;* V22 tmp19        [V22    ] (  0,  0   )  simd32  ->  zero-ref    "field V06._lower (fldOffset=0x0)" P-INDEP
 ;* V23 tmp20        [V23    ] (  0,  0   )  simd32  ->  zero-ref    "field V06._upper (fldOffset=0x20)" P-INDEP
 ;  V24 tmp21        [V24    ] (  2,  5   )  simd32  ->  [rbp-0x40]  do-not-enreg[XS] addr-exposed "field V07._lower (fldOffset=0x0)" P-DEP
 ;  V25 tmp22        [V25    ] (  2,  5   )  simd32  ->  [rbp-0x20]  do-not-enreg[XS] addr-exposed "field V07._upper (fldOffset=0x20)" P-DEP
 ;* V26 tmp23        [V26    ] (  0,  0   )  simd32  ->  zero-ref    "field V10._lower (fldOffset=0x0)" P-INDEP
 ;* V27 tmp24        [V27    ] (  0,  0   )  simd32  ->  zero-ref    "field V10._upper (fldOffset=0x20)" P-INDEP
 ;* V28 tmp25        [V28    ] (  0,  0   )  simd32  ->  zero-ref    "field V14._lower (fldOffset=0x0)" P-INDEP
 ;* V29 tmp26        [V29    ] (  0,  0   )  simd32  ->  zero-ref    "field V14._upper (fldOffset=0x20)" P-INDEP
 ;  V30 tmp27        [V30,T12] (  2,  2   )  simd32  ->  mm0         "field V15._lower (fldOffset=0x0)" P-INDEP
 ;  V31 tmp28        [V31,T13] (  2,  2   )  simd32  ->  mm1         "field V15._upper (fldOffset=0x20)" P-INDEP
 ;  V32 cse0         [V32,T09] (  3,  3   )  simd32  ->  mm2         "CSE #02: moderate"
 ;
 ; Lcl frame size = 64
 
 G_M3741_IG01:
        push     rbp
        sub      rsp, 64
        lea      rbp, [rsp+0x40]
 						;; size=10 bbWeight=0.25 PerfScore 0.44
 G_M3741_IG02:
        xor      eax, eax
        align    [0 bytes for IG03]
 						;; size=2 bbWeight=0.25 PerfScore 0.06
 G_M3741_IG03:
        lea      rcx, [rbp-0x40]
        movsxd   r8, eax
        mov      word  ptr [rcx+2*r8], ax
        inc      eax
        cmp      eax, 32
        jl       SHORT G_M3741_IG03
 						;; size=19 bbWeight=4 PerfScore 13.00
 G_M3741_IG04:
        vmovups  ymm0, ymmword ptr [rbp-0x40]
        vmovups  ymm1, ymmword ptr [rbp-0x20]
-       movsx    rax, dx
-       vmovd    xmm2, eax
+       vmovd    xmm2, edx
        vpbroadcastw ymm2, ymm2
        vpmullw  ymm0, ymm2, ymm0
        vpmullw  ymm1, ymm2, ymm1
-       movsx    rax, si
-       vmovd    xmm2, eax
+       vmovd    xmm2, esi
        vpbroadcastw ymm2, ymm2
        vpaddw   ymm0, ymm2, ymm0
        vpaddw   ymm1, ymm2, ymm1
        vmovups  ymmword ptr [rdi], ymm0
        vmovups  ymmword ptr [rdi+0x20], ymm1
        mov      rax, rdi
-						;; size=64 bbWeight=1 PerfScore 29.42
+						;; size=56 bbWeight=1 PerfScore 28.92
 G_M3741_IG05:
        vzeroupper 
        add      rsp, 64
        pop      rbp
        ret      
 						;; size=9 bbWeight=1 PerfScore 2.75
 
-; Total bytes of code 104, prolog size 10, PerfScore 45.67, instruction count 30, allocated bytes for code 104 (MethodHash=ecfef162) for method System.Runtime.Intrinsics.Vector512:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector512`1[short] (FullOpts)
+; Total bytes of code 96, prolog size 10, PerfScore 45.17, instruction count 28, allocated bytes for code 96 (MethodHash=ecfef162) for method System.Runtime.Intrinsics.Vector512:CreateSequence[short](short,short):System.Runtime.Intrinsics.Vector512`1[short] (FullOpts)
 ; ============================================================
-7 (-6.09 % of base) - System.Numerics.Vector:CreateSequence[ubyte](ubyte,ubyte):System.Numerics.Vector`1[ubyte]
 ; Assembly listing for method System.Numerics.Vector:CreateSequence[ubyte](ubyte,ubyte):System.Numerics.Vector`1[ubyte] (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 4 single block inlinees; 2 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 RetBuf       [V00,T02] (  4,  4   )   byref  ->  rdi         single-def
 ;  V01 arg0         [V01,T03] (  3,  3   )   ubyte  ->  rsi         single-def
 ;  V02 arg1         [V02,T04] (  3,  2.25)   ubyte  ->  rdx         single-def
 ;# V03 OutArgs      [V03    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;* V04 tmp1         [V04    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
 ;  V05 tmp2         [V05    ] (  2,  5   )  simd32  ->  [rbp-0x30]  do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <System.Numerics.Vector`1[ubyte]>
 ;  V06 tmp3         [V06,T00] (  5, 16.25)     int  ->  rax         "Inline stloc first use temp"
 ;  V07 tmp4         [V07    ] (  2,  8.50)  simd32  ->  [rbp-0x50]  do-not-enreg[XS] addr-exposed ld-addr-op "Inlining Arg" <System.Numerics.Vector`1[ubyte]>
 ;  V08 tmp5         [V08    ] (  2,  8.50)  simd32  ->  [rbp-0x70]  do-not-enreg[XS] addr-exposed ld-addr-op "Inlining Arg" <System.Numerics.Vector`1[ubyte]>
 ;* V09 tmp6         [V09    ] (  0,  0   )     int  ->  zero-ref    "impAppendStmt"
 ;* V10 tmp7         [V10    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline stloc first use temp"
 ;* V11 tmp8         [V11    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;* V12 tmp9         [V12    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
 ;* V13 tmp10        [V13    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
 ;  V14 cse0         [V14,T01] (  4, 16   )    long  ->  rdx         "CSE #01: aggressive"
 ;
 ; Lcl frame size = 112
 
 G_M16765_IG01:
        push     rbp
        sub      rsp, 112
        lea      rbp, [rsp+0x70]
 						;; size=10 bbWeight=0.25 PerfScore 0.44
 G_M16765_IG02:
        vmovups  ymm0, ymmword ptr [reloc @RWD00]
        vmovups  ymmword ptr [rbp-0x50], ymm0
-       movzx    rax, dl
-       vmovd    xmm0, eax
+       vmovd    xmm0, edx
        vpbroadcastb ymm0, ymm0
        vmovups  ymmword ptr [rbp-0x70], ymm0
        xor      eax, eax
        align    [0 bytes for IG03]
-						;; size=32 bbWeight=0.25 PerfScore 2.62
+						;; size=29 bbWeight=0.25 PerfScore 2.56
 G_M16765_IG03:
        lea      rcx, [rbp-0x50]
        movsxd   rdx, eax
        movzx    rcx, byte  ptr [rcx+rdx]
        lea      r8, [rbp-0x70]
        movzx    r8, byte  ptr [r8+rdx]
        imul     ecx, r8d
        lea      r8, [rbp-0x30]
        mov      byte  ptr [r8+rdx], cl
        inc      eax
        cmp      eax, 32
        jl       SHORT G_M16765_IG03
 						;; size=39 bbWeight=4 PerfScore 41.00
 G_M16765_IG04:
-       movzx    rax, sil
-       vmovd    xmm0, eax
+       vmovd    xmm0, esi
        vpbroadcastb ymm0, ymm0
        vpaddb   ymm0, ymm0, ymmword ptr [rbp-0x30]
        vmovups  ymmword ptr [rdi], ymm0
        mov      rax, rdi
-						;; size=25 bbWeight=1 PerfScore 8.50
+						;; size=21 bbWeight=1 PerfScore 8.25
 G_M16765_IG05:
        vzeroupper 
        add      rsp, 112
        pop      rbp
        ret      
 						;; size=9 bbWeight=1 PerfScore 2.75
 RWD00  	dq	0706050403020100h, 0F0E0D0C0B0A0908h, 1716151413121110h, 1F1E1D1C1B1A1918h
 
 
-; Total bytes of code 115, prolog size 10, PerfScore 55.31, instruction count 32, allocated bytes for code 115 (MethodHash=2b2fbe82) for method System.Numerics.Vector:CreateSequence[ubyte](ubyte,ubyte):System.Numerics.Vector`1[ubyte] (FullOpts)
+; Total bytes of code 108, prolog size 10, PerfScore 55.00, instruction count 30, allocated bytes for code 108 (MethodHash=2b2fbe82) for method System.Numerics.Vector:CreateSequence[ubyte](ubyte,ubyte):System.Numerics.Vector`1[ubyte] (FullOpts)
 ; ============================================================
-7 (-9.21 % of base) - System.Runtime.Intrinsics.Vector128:CreateSequence[ubyte](ubyte,ubyte):System.Runtime.Intrinsics.Vector128`1[ubyte]
 ; Assembly listing for method System.Runtime.Intrinsics.Vector128:CreateSequence[ubyte](ubyte,ubyte):System.Runtime.Intrinsics.Vector128`1[ubyte] (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rsp based frame
 ; partially interruptible
 ; No PGO data
 ; Final local variable assignments
 ;
 ;  V00 RetBuf       [V00,T00] (  4,  4   )   byref  ->  rdi         single-def
 ;  V01 arg0         [V01,T01] (  3,  3   )   ubyte  ->  rsi         single-def
 ;  V02 arg1         [V02,T02] (  3,  3   )   ubyte  ->  rdx         single-def
 ;# V03 OutArgs      [V03    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;  V04 tmp1         [V04,T03] (  3,  6   )  simd32  ->  mm0         "fgMakeTemp is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
 G_M32957_IG01:
 						;; size=0 bbWeight=1 PerfScore 0.00
 G_M32957_IG02:
        vpmovzxbw ymm0, xmmword ptr [reloc @RWD00]
-       movzx    rax, dl
-       vmovd    xmm1, eax
+       vmovd    xmm1, edx
        vpbroadcastb xmm1, xmm1
        vpmovzxbw ymm1, ymm1
        vpmullw  ymm0, ymm1, ymm0
        vpand    ymm0, ymm0, ymmword ptr [reloc @RWD32]
        vpackuswb ymm0, ymm0, ymm0
        vpermq   ymm0, ymm0, -40
-       movzx    rax, sil
-       vmovd    xmm1, eax
+       vmovd    xmm1, esi
        vpbroadcastb xmm1, xmm1
        vpaddb   xmm0, xmm1, xmm0
        vmovups  xmmword ptr [rdi], xmm0
        mov      rax, rdi
-						;; size=72 bbWeight=1 PerfScore 24.08
+						;; size=65 bbWeight=1 PerfScore 23.58
 G_M32957_IG03:
        vzeroupper 
        ret      
 						;; size=4 bbWeight=1 PerfScore 2.00
 RWD00  	dq	0706050403020100h, 0F0E0D0C0B0A0908h
 RWD16  	dd	00000000h, 00000000h, 00000000h, 00000000h
 RWD32  	dq	00FF00FF00FF00FFh, 00FF00FF00FF00FFh, 00FF00FF00FF00FFh, 00FF00FF00FF00FFh
 
 
-; Total bytes of code 76, prolog size 0, PerfScore 26.08, instruction count 17, allocated bytes for code 76 (MethodHash=ce707f42) for method System.Runtime.Intrinsics.Vector128:CreateSequence[ubyte](ubyte,ubyte):System.Runtime.Intrinsics.Vector128`1[ubyte] (FullOpts)
+; Total bytes of code 69, prolog size 0, PerfScore 25.58, instruction count 15, allocated bytes for code 69 (MethodHash=ce707f42) for method System.Runtime.Intrinsics.Vector128:CreateSequence[ubyte](ubyte,ubyte):System.Runtime.Intrinsics.Vector128`1[ubyte] (FullOpts)
 ; ============================================================
-7 (-6.25 % of base) - System.Runtime.Intrinsics.Vector256:CreateSequence[ubyte](ubyte,ubyte):System.Runtime.Intrinsics.Vector256`1[ubyte]
 ; Assembly listing for method System.Runtime.Intrinsics.Vector256:CreateSequence[ubyte](ubyte,ubyte):System.Runtime.Intrinsics.Vector256`1[ubyte] (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rsp based frame
 ; partially interruptible
 ; No PGO data
 ; Final local variable assignments
 ;
 ;  V00 RetBuf       [V00,T00] (  4,  4   )   byref  ->  rdi         single-def
 ;  V01 arg0         [V01,T01] (  3,  3   )   ubyte  ->  rsi         single-def
 ;  V02 arg1         [V02,T02] (  3,  3   )   ubyte  ->  rdx         single-def
 ;# V03 OutArgs      [V03    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;  V04 tmp1         [V04,T03] (  3,  6   )  simd32  ->  mm1         "fgMakeTemp is creating a new local variable"
 ;  V05 cse0         [V05,T04] (  3,  3   )  simd32  ->  mm2         "CSE #01: aggressive"
 ;
 ; Lcl frame size = 0
 
 G_M8317_IG01:
 						;; size=0 bbWeight=1 PerfScore 0.00
 G_M8317_IG02:
        vpmovzxbw ymm0, xmmword ptr [reloc @RWD00]
-       movzx    rax, dl
-       vmovd    xmm1, eax
+       vmovd    xmm1, edx
        vpbroadcastb ymm1, ymm1
        vmovaps  ymm2, ymm1
        vpmovzxbw ymm2, ymm2
        vpmullw  ymm0, ymm2, ymm0
        vmovups  ymm2, ymmword ptr [reloc @RWD32]
        vpand    ymm0, ymm0, ymm2
        vpmovzxbw ymm3, xmmword ptr [reloc @RWD64]
        vextracti128 xmm1, ymm1, 1
        vpmovzxbw ymm1, ymm1
        vpmullw  ymm1, ymm1, ymm3
        vpand    ymm1, ymm1, ymm2
        vpackuswb ymm0, ymm0, ymm1
        vpermq   ymm0, ymm0, -40
-       movzx    rax, sil
-       vmovd    xmm1, eax
+       vmovd    xmm1, esi
        vpbroadcastb ymm1, ymm1
        vpaddb   ymm0, ymm1, ymm0
        vmovups  ymmword ptr [rdi], ymm0
        mov      rax, rdi
-						;; size=108 bbWeight=1 PerfScore 41.00
+						;; size=101 bbWeight=1 PerfScore 40.50
 G_M8317_IG03:
        vzeroupper 
        ret      
 						;; size=4 bbWeight=1 PerfScore 2.00
 RWD00  	dq	0706050403020100h, 0F0E0D0C0B0A0908h
 RWD16  	dd	00000000h, 00000000h, 00000000h, 00000000h
 RWD32  	dq	00FF00FF00FF00FFh, 00FF00FF00FF00FFh, 00FF00FF00FF00FFh, 00FF00FF00FF00FFh
 RWD64  	dq	1716151413121110h, 1F1E1D1C1B1A1918h
 
 
-; Total bytes of code 112, prolog size 0, PerfScore 43.00, instruction count 24, allocated bytes for code 112 (MethodHash=ef88df82) for method System.Runtime.Intrinsics.Vector256:CreateSequence[ubyte](ubyte,ubyte):System.Runtime.Intrinsics.Vector256`1[ubyte] (FullOpts)
+; Total bytes of code 105, prolog size 0, PerfScore 42.50, instruction count 22, allocated bytes for code 105 (MethodHash=ef88df82) for method System.Runtime.Intrinsics.Vector256:CreateSequence[ubyte](ubyte,ubyte):System.Runtime.Intrinsics.Vector256`1[ubyte] (FullOpts)
 ; ============================================================
-7 (-3.14 % of base) - System.Runtime.Intrinsics.Vector512:CreateSequence[ubyte](ubyte,ubyte):System.Runtime.Intrinsics.Vector512`1[ubyte]
 ; Assembly listing for method System.Runtime.Intrinsics.Vector512:CreateSequence[ubyte](ubyte,ubyte):System.Runtime.Intrinsics.Vector512`1[ubyte] (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 14 single block inlinees; 7 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 RetBuf       [V00,T01] (  5,  5   )   byref  ->  rdi         single-def
 ;  V01 arg0         [V01,T02] (  3,  3   )   ubyte  ->  rsi         single-def
 ;  V02 arg1         [V02,T03] (  3,  3   )   ubyte  ->  rdx         single-def
 ;# V03 OutArgs      [V03    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;* V04 tmp1         [V04    ] (  0,  0   )  struct (64) zero-ref    "spilled call-like call argument" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V05 tmp2         [V05    ] (  0,  0   )  struct (64) zero-ref    "impAppendStmt" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V06 tmp3         [V06    ] (  0,  0   )  struct (64) zero-ref    "spilled call-like call argument" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;  V07 tmp4         [V07    ] (  3,  6   )  struct (64) [rbp-0x40]  do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;  V08 tmp5         [V08,T00] (  6, 20.25)     int  ->  rax         "Inline stloc first use temp"
 ;* V09 tmp6         [V09    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;* V10 tmp7         [V10    ] (  0,  0   )  simd32  ->  zero-ref    "fgMakeTemp is creating a new local variable"
 ;  V11 tmp8         [V11,T04] (  3,  6   )  simd32  ->  mm4         "fgMakeTemp is creating a new local variable"
 ;* V12 tmp9         [V12    ] (  0,  0   )  simd32  ->  zero-ref    "fgMakeTemp is creating a new local variable"
 ;  V13 tmp10        [V13,T07] (  2,  4   )  simd32  ->  mm3         "fgMakeTemp is creating a new local variable"
 ;* V14 tmp11        [V14    ] (  0,  0   )  struct (64) zero-ref    ld-addr-op "Inline ldloca(s) first use temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;  V15 tmp12        [V15,T08] (  2,  4   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V16 tmp13        [V16,T09] (  2,  4   )  simd32  ->  mm1         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V17 tmp14        [V17,T05] (  3,  6   )  simd32  ->  mm2         "dup spill"
 ;* V18 tmp15        [V18    ] (  0,  0   )  struct (64) zero-ref    ld-addr-op "Inline ldloca(s) first use temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V19 tmp16        [V19    ] (  0,  0   )  struct (64) zero-ref    ld-addr-op "Inline ldloca(s) first use temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;  V20 tmp17        [V20,T10] (  2,  4   )  simd32  ->  mm0         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V21 tmp18        [V21,T11] (  2,  4   )  simd32  ->  mm1         "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V22 tmp19        [V22,T12] (  3,  3   )  simd32  ->  mm0         "field V04._lower (fldOffset=0x0)" P-INDEP
 ;  V23 tmp20        [V23,T13] (  3,  3   )  simd32  ->  mm1         "field V04._upper (fldOffset=0x20)" P-INDEP
 ;* V24 tmp21        [V24    ] (  0,  0   )  simd32  ->  zero-ref    "field V05._lower (fldOffset=0x0)" P-INDEP
 ;* V25 tmp22        [V25    ] (  0,  0   )  simd32  ->  zero-ref    "field V05._upper (fldOffset=0x20)" P-INDEP
 ;* V26 tmp23        [V26    ] (  0,  0   )  simd32  ->  zero-ref    "field V06._lower (fldOffset=0x0)" P-INDEP
 ;* V27 tmp24        [V27    ] (  0,  0   )  simd32  ->  zero-ref    "field V06._upper (fldOffset=0x20)" P-INDEP
 ;  V28 tmp25        [V28    ] (  2,  5   )  simd32  ->  [rbp-0x40]  do-not-enreg[XS] addr-exposed "field V07._lower (fldOffset=0x0)" P-DEP
 ;  V29 tmp26        [V29    ] (  2,  5   )  simd32  ->  [rbp-0x20]  do-not-enreg[XS] addr-exposed "field V07._upper (fldOffset=0x20)" P-DEP
 ;* V30 tmp27        [V30    ] (  0,  0   )  simd32  ->  zero-ref    "field V14._lower (fldOffset=0x0)" P-INDEP
 ;* V31 tmp28        [V31    ] (  0,  0   )  simd32  ->  zero-ref    "field V14._upper (fldOffset=0x20)" P-INDEP
 ;* V32 tmp29        [V32    ] (  0,  0   )  simd32  ->  zero-ref    "field V18._lower (fldOffset=0x0)" P-INDEP
 ;* V33 tmp30        [V33    ] (  0,  0   )  simd32  ->  zero-ref    "field V18._upper (fldOffset=0x20)" P-INDEP
 ;  V34 tmp31        [V34,T16] (  2,  2   )  simd32  ->  mm0         "field V19._lower (fldOffset=0x0)" P-INDEP
 ;  V35 tmp32        [V35,T17] (  2,  2   )  simd32  ->  mm1         "field V19._upper (fldOffset=0x20)" P-INDEP
 ;  V36 cse0         [V36,T14] (  3,  3   )  simd32  ->  mm3         "CSE #03: moderate"
 ;  V37 cse1         [V37,T06] (  5,  5   )  simd32  ->  mm5         "CSE #01: moderate"
 ;  V38 cse2         [V38,T15] (  3,  3   )  simd32  ->  mm4         "CSE #05: moderate"
 ;
 ; Lcl frame size = 64
 
 G_M18333_IG01:
        push     rbp
        sub      rsp, 64
        lea      rbp, [rsp+0x40]
 						;; size=10 bbWeight=0.25 PerfScore 0.44
 G_M18333_IG02:
        xor      eax, eax
        align    [0 bytes for IG03]
 						;; size=2 bbWeight=0.25 PerfScore 0.06
 G_M18333_IG03:
        lea      rcx, [rbp-0x40]
        movsxd   r8, eax
        mov      byte  ptr [rcx+r8], al
        inc      eax
        cmp      eax, 64
        jl       SHORT G_M18333_IG03
 						;; size=18 bbWeight=4 PerfScore 13.00
 G_M18333_IG04:
        vmovups  ymm0, ymmword ptr [rbp-0x40]
        vmovups  ymm1, ymmword ptr [rbp-0x20]
        vmovaps  ymm2, ymm0
        vpmovzxbw ymm2, ymm2
-       movzx    rax, dl
-       vmovd    xmm3, eax
+       vmovd    xmm3, edx
        vpbroadcastb ymm3, ymm3
        vmovaps  ymm4, ymm3
        vmovaps  ymm5, ymm4
        vpmovzxbw ymm5, ymm5
        vpmullw  ymm2, ymm5, ymm2
        vmovups  ymm5, ymmword ptr [reloc @RWD00]
        vpand    ymm2, ymm2, ymm5
        vextracti128 xmm0, ymm0, 1
        vpmovzxbw ymm0, ymm0
        vextracti128 xmm4, ymm4, 1
        vpmovzxbw ymm4, ymm4
        vpmullw  ymm0, ymm0, ymm4
        vpand    ymm0, ymm0, ymm5
        vpackuswb ymm0, ymm2, ymm0
        vpermq   ymm0, ymm0, -40
        vmovaps  ymm2, ymm1
        vpmovzxbw ymm2, ymm2
        vpmovzxbw ymm3, ymm3
        vpmullw  ymm2, ymm3, ymm2
        vpand    ymm2, ymm2, ymm5
        vextracti128 xmm1, ymm1, 1
        vpmovzxbw ymm1, ymm1
        vpmullw  ymm1, ymm1, ymm4
        vpand    ymm1, ymm1, ymm5
        vpackuswb ymm1, ymm2, ymm1
        vpermq   ymm1, ymm1, -40
-       movzx    rax, sil
-       vmovd    xmm2, eax
+       vmovd    xmm2, esi
        vpbroadcastb ymm2, ymm2
        vpaddb   ymm0, ymm2, ymm0
        vpaddb   ymm1, ymm2, ymm1
        vmovups  ymmword ptr [rdi], ymm0
        vmovups  ymmword ptr [rdi+0x20], ymm1
        mov      rax, rdi
-						;; size=184 bbWeight=1 PerfScore 69.75
+						;; size=177 bbWeight=1 PerfScore 69.25
 G_M18333_IG05:
        vzeroupper 
        add      rsp, 64
        pop      rbp
        ret      
 						;; size=9 bbWeight=1 PerfScore 2.75
 RWD00  	dq	00FF00FF00FF00FFh, 00FF00FF00FF00FFh, 00FF00FF00FF00FFh, 00FF00FF00FF00FFh
 
 
-; Total bytes of code 223, prolog size 10, PerfScore 86.00, instruction count 55, allocated bytes for code 223 (MethodHash=434cb862) for method System.Runtime.Intrinsics.Vector512:CreateSequence[ubyte](ubyte,ubyte):System.Runtime.Intrinsics.Vector512`1[ubyte] (FullOpts)
+; Total bytes of code 216, prolog size 10, PerfScore 85.50, instruction count 53, allocated bytes for code 216 (MethodHash=434cb862) for method System.Runtime.Intrinsics.Vector512:CreateSequence[ubyte](ubyte,ubyte):System.Runtime.Intrinsics.Vector512`1[ubyte] (FullOpts)
 ; ============================================================
-6 (-1.64 % of base) - System.PackedSpanHelpers:IndexOfAnyInRange[System.SpanHelpers+DontNegate`1[short]](byref,short,short,int):int
 ; Assembly listing for method System.PackedSpanHelpers:IndexOfAnyInRange[System.SpanHelpers+DontNegate`1[short]](byref,short,short,int):int (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 7 single block inlinees; 5 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T02] ( 11, 10   )   byref  ->  rdi         single-def
 ;  V01 arg1         [V01,T04] (  5,  3.50)   short  ->  rsi         single-def
 ;  V02 arg2         [V02,T05] (  5,  3.50)   short  ->  rdx         single-def
 ;  V03 arg3         [V03,T03] ( 10, 10   )     int  ->  rcx         single-def
 ;  V04 loc0         [V04,T08] (  2,  4.50)     int  ->  rax        
 ;  V05 loc1         [V05,T09] (  2,  4.50)     int  ->  rdx        
 ;  V06 loc2         [V06,T01] (  6, 17   )     int  ->  rsi        
 ;  V07 loc3         [V07,T00] ( 10, 22.50)   byref  ->  rax        
 ;* V08 loc4         [V08    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V09 loc5         [V09    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V10 loc6         [V10    ] (  0,  0   )   byref  ->  zero-ref   
 ;* V11 loc7         [V11    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[short]>
 ;* V12 loc8         [V12    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;* V13 loc9         [V13    ] (  0,  0   )   byref  ->  zero-ref   
 ;* V14 loc10        [V14    ] (  0,  0   )   byref  ->  zero-ref   
 ;* V15 loc11        [V15    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[short]>
 ;* V16 loc12        [V16    ] (  0,  0   )  struct (64) zero-ref    <System.Runtime.Intrinsics.Vector512`1[ubyte]>
 ;  V17 loc13        [V17,T20] (  3,  5   )  simd32  ->  mm0         <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V18 loc14        [V18,T21] (  3,  5   )  simd32  ->  mm1         <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V19 loc15        [V19,T07] (  2,  4.50)   byref  ->  rdx         single-def
 ;* V20 loc16        [V20    ] (  0,  0   )  simd32  ->  zero-ref    <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V21 loc17        [V21,T19] (  4, 12.50)  simd32  ->  mm2         <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V22 loc18        [V22,T12] (  5,  2.50)   byref  ->  rcx         single-def
 ;* V23 loc19        [V23    ] (  0,  0   )   byref  ->  zero-ref    single-def
 ;* V24 loc20        [V24    ] (  0,  0   )  simd32  ->  zero-ref    <System.Runtime.Intrinsics.Vector256`1[short]>
 ;  V25 loc21        [V25,T24] (  4,  2   )  simd32  ->  mm0         <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;  V26 loc22        [V26,T26] (  2,  1   )  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;  V27 loc23        [V27,T27] (  2,  1   )  simd16  ->  mm1         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;* V28 loc24        [V28    ] (  0,  0   )   byref  ->  zero-ref   
 ;* V29 loc25        [V29    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[short]>
 ;* V30 loc26        [V30    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;  V31 loc27        [V31,T13] (  5,  2.50)   byref  ->  rax         single-def
 ;* V32 loc28        [V32    ] (  0,  0   )   byref  ->  zero-ref    single-def
 ;* V33 loc29        [V33    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[short]>
 ;  V34 loc30        [V34,T25] (  4,  2   )  simd16  ->  mm0         <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;# V35 OutArgs      [V35    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;  V36 tmp1         [V36,T16] (  3,  1.50)   byref  ->  rcx        
+;  V36 tmp1         [V36,T16] (  3,  1.50)   byref  ->  rdx        
 ;* V37 tmp2         [V37    ] (  0,  0   )  simd16  ->  zero-ref    "impAppendStmt"
 ;* V38 tmp3         [V38    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
 ;  V39 tmp4         [V39,T22] (  3,  3   )  simd16  ->  mm0         "fgMakeTemp is creating a new local variable"
 ;  V40 tmp5         [V40,T17] (  3,  1.50)   byref  ->  rax        
 ;* V41 tmp6         [V41    ] (  0,  0   )  simd32  ->  zero-ref    "impAppendStmt"
 ;* V42 tmp7         [V42    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
 ;  V43 tmp8         [V43,T23] (  3,  3   )  simd32  ->  mm0         "fgMakeTemp is creating a new local variable"
 ;* V44 tmp9         [V44    ] (  0,  0   )  simd32  ->  zero-ref    "impAppendStmt"
 ;* V45 tmp10        [V45    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
 ;  V46 tmp11        [V46,T18] (  3, 24   )  simd32  ->  mm2         "fgMakeTemp is creating a new local variable"
 ;* V47 tmp12        [V47    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
 ;* V48 tmp13        [V48    ] (  0,  0   )  simd32  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;* V49 tmp14        [V49    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
 ;* V50 tmp15        [V50    ] (  0,  0   )  simd32  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
 ;* V51 tmp16        [V51    ] (  0,  0   )  simd32  ->  zero-ref    "spilled call-like call argument"
 ;  V52 tmp17        [V52,T14] (  5,  2.50)     int  ->  rdx         "Inline stloc first use temp"
 ;  V53 tmp18        [V53,T10] (  3,  3   )   byref  ->  rax         "Inlining Arg"
 ;* V54 tmp19        [V54    ] (  0,  0   )  simd16  ->  zero-ref    "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;  V55 tmp20        [V55,T15] (  5,  2.50)     int  ->  rdx         "Inline stloc first use temp"
-;  V56 tmp21        [V56,T11] (  3,  3   )   byref  ->  rcx         "Inlining Arg"
+;  V55 tmp20        [V55,T15] (  5,  2.50)     int  ->  rcx         "Inline stloc first use temp"
+;  V56 tmp21        [V56,T11] (  3,  3   )   byref  ->  rdx         "Inlining Arg"
 ;  V57 tmp22        [V57,T06] (  5,  5   )     int  ->  rax         "Single return block return value"
 ;* V58 tmp23        [V58    ] (  0,  0   )  simd32  ->  zero-ref    "field V08._lower (fldOffset=0x0)" P-INDEP
 ;* V59 tmp24        [V59    ] (  0,  0   )  simd32  ->  zero-ref    "field V08._upper (fldOffset=0x20)" P-INDEP
 ;* V60 tmp25        [V60    ] (  0,  0   )  simd32  ->  zero-ref    "field V09._lower (fldOffset=0x0)" P-INDEP
 ;* V61 tmp26        [V61    ] (  0,  0   )  simd32  ->  zero-ref    "field V09._upper (fldOffset=0x20)" P-INDEP
 ;* V62 tmp27        [V62    ] (  0,  0   )  simd32  ->  zero-ref    "field V11._lower (fldOffset=0x0)" P-INDEP
 ;* V63 tmp28        [V63    ] (  0,  0   )  simd32  ->  zero-ref    "field V11._upper (fldOffset=0x20)" P-INDEP
 ;* V64 tmp29        [V64    ] (  0,  0   )  simd32  ->  zero-ref    "field V12._lower (fldOffset=0x0)" P-INDEP
 ;* V65 tmp30        [V65    ] (  0,  0   )  simd32  ->  zero-ref    "field V12._upper (fldOffset=0x20)" P-INDEP
 ;* V66 tmp31        [V66    ] (  0,  0   )  simd32  ->  zero-ref    "field V15._lower (fldOffset=0x0)" P-INDEP
 ;* V67 tmp32        [V67    ] (  0,  0   )  simd32  ->  zero-ref    "field V15._upper (fldOffset=0x20)" P-INDEP
 ;* V68 tmp33        [V68    ] (  0,  0   )  simd32  ->  zero-ref    "field V16._lower (fldOffset=0x0)" P-INDEP
 ;* V69 tmp34        [V69    ] (  0,  0   )  simd32  ->  zero-ref    "field V16._upper (fldOffset=0x20)" P-INDEP
 ;
 ; Lcl frame size = 0
 
 G_M62233_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M62233_IG02:
        cmp      ecx, 8
        jl       G_M62233_IG10
 						;; size=9 bbWeight=1 PerfScore 1.25
 G_M62233_IG03:
        mov      rax, rdi
        cmp      ecx, 16
        jg       SHORT G_M62233_IG05
-       movzx    rsi, sil
        vmovd    xmm0, esi
        vpbroadcastb xmm0, xmm0
-       movzx    rdx, dl
        vmovd    xmm1, edx
        vpbroadcastb xmm1, xmm1
        add      ecx, -8
-       movsxd   rax, ecx
-       lea      rax, bword ptr [rdi+2*rax]
+       movsxd   rsi, ecx
+       lea      rax, bword ptr [rdi+2*rsi]
        cmp      rdi, rax
-       mov      rcx, rdi
-       cmova    rcx, rax
-       vmovups  xmm2, xmmword ptr [rcx]
+       mov      rdx, rdi
+       cmova    rdx, rax
+       vmovups  xmm2, xmmword ptr [rdx]
        vpackuswb xmm2, xmm2, xmmword ptr [rax]
        vpsubb   xmm0, xmm2, xmm0
        vpminub  xmm1, xmm0, xmm1
        vpcmpeqb xmm0, xmm1, xmm0
        vptest   xmm0, xmm0
        je       G_M62233_IG12
-       vpmovmskb edx, xmm0
-       tzcnt    edx, edx
-       cmp      edx, 8
+       vpmovmskb ecx, xmm0
+       tzcnt    ecx, ecx
+       cmp      ecx, 8
        jl       SHORT G_M62233_IG04
-       mov      rcx, rax
-       add      edx, -8
-						;; size=103 bbWeight=0.50 PerfScore 13.92
+       mov      rdx, rax
+       add      ecx, -8
+						;; size=96 bbWeight=0.50 PerfScore 13.67
 G_M62233_IG04:
-       sub      rcx, rdi
-       shr      rcx, 1
-       lea      eax, [rcx+rdx]
+       sub      rdx, rdi
+       shr      rdx, 1
+       lea      eax, [rdx+rcx]
        jmp      G_M62233_IG15
-       align    [0 bytes for IG06]
-						;; size=14 bbWeight=0.50 PerfScore 1.62
+       align    [4 bytes for IG06]
+						;; size=18 bbWeight=0.50 PerfScore 1.62
 G_M62233_IG05:
-       movzx    rsi, sil
        vmovd    xmm0, esi
        vpbroadcastb ymm0, ymm0
-       movzx    rdx, dl
        vmovd    xmm1, edx
        vpbroadcastb ymm1, ymm1
        cmp      ecx, 32
        jle      SHORT G_M62233_IG07
-       lea      edx, [rcx-0x20]
-       movsxd   rdx, edx
+       lea      esi, [rcx-0x20]
+       movsxd   rdx, esi
        lea      rdx, bword ptr [rax+2*rdx]
-						;; size=40 bbWeight=0.50 PerfScore 5.50
+						;; size=33 bbWeight=0.50 PerfScore 5.25
 G_M62233_IG06:
        vmovups  ymm2, ymmword ptr [rax]
        vpackuswb ymm2, ymm2, ymmword ptr [rax+0x20]
        vpsubb   ymm2, ymm2, ymm0
        vpminub  ymm3, ymm2, ymm1
        vpcmpeqb ymm2, ymm3, ymm2
        vptest   ymm2, ymm2
        jne      SHORT G_M62233_IG09
        add      rax, 64
        cmp      rax, rdx
        jb       SHORT G_M62233_IG06
 						;; size=37 bbWeight=4 PerfScore 67.33
 G_M62233_IG07:
        add      ecx, -16
        movsxd   rcx, ecx
        lea      rcx, bword ptr [rdi+2*rcx]
        cmp      rax, rcx
        cmova    rax, rcx
        vmovups  ymm2, ymmword ptr [rax]
        vpackuswb ymm2, ymm2, ymmword ptr [rcx]
        vpsubb   ymm0, ymm2, ymm0
        vpminub  ymm1, ymm1, ymm0
        vpcmpeqb ymm0, ymm1, ymm0
        vptest   ymm0, ymm0
        je       SHORT G_M62233_IG12
        vpermq   ymm0, ymm0, -40
        vpmovmskb edx, ymm0
        tzcnt    edx, edx
        cmp      edx, 16
        jl       SHORT G_M62233_IG08
        mov      rax, rcx
        add      edx, -16
 						;; size=69 bbWeight=0.50 PerfScore 12.79
 G_M62233_IG08:
        sub      rax, rdi
        shr      rax, 1
        add      eax, edx
        jmp      SHORT G_M62233_IG15
-       align    [0 bytes for IG11]
-						;; size=10 bbWeight=0.50 PerfScore 1.50
+       align    [4 bytes for IG11]
+						;; size=14 bbWeight=0.50 PerfScore 1.50
 G_M62233_IG09:
        sub      rax, rdi
        shr      rax, 1
        vpermq   ymm0, ymm2, -40
        vpmovmskb edi, ymm0
        xor      ecx, ecx
        tzcnt    ecx, edi
        add      eax, ecx
        jmp      SHORT G_M62233_IG15
 						;; size=26 bbWeight=0.50 PerfScore 5.12
 G_M62233_IG10:
        movsx    rax, si
        movsx    rdx, dx
        xor      esi, esi
        test     ecx, ecx
        jle      SHORT G_M62233_IG12
 						;; size=14 bbWeight=0.50 PerfScore 1.00
 G_M62233_IG11:
        movsxd   r8, esi
        movsx    r8, word  ptr [rdi+2*r8]
        sub      r8d, eax
        cmp      r8d, edx
        jbe      SHORT G_M62233_IG14
        inc      esi
        cmp      esi, ecx
        jl       SHORT G_M62233_IG11
 						;; size=22 bbWeight=4 PerfScore 29.00
 G_M62233_IG12:
        mov      eax, -1
 						;; size=5 bbWeight=0.50 PerfScore 0.12
 G_M62233_IG13:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M62233_IG14:
        mov      eax, esi
 						;; size=2 bbWeight=0.50 PerfScore 0.12
 G_M62233_IG15:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 
-; Total bytes of code 365, prolog size 4, PerfScore 143.04, instruction count 111, allocated bytes for code 369 (MethodHash=1fbe0ce6) for method System.PackedSpanHelpers:IndexOfAnyInRange[System.SpanHelpers+DontNegate`1[short]](byref,short,short,int):int (FullOpts)
+; Total bytes of code 359, prolog size 4, PerfScore 142.54, instruction count 107, allocated bytes for code 359 (MethodHash=1fbe0ce6) for method System.PackedSpanHelpers:IndexOfAnyInRange[System.SpanHelpers+DontNegate`1[short]](byref,short,short,int):int (FullOpts)
 ; ============================================================
-5 (-2.31 % of base) - System.SpanHelpers:Fill[short](byref,ulong,short)
 ; Assembly listing for method System.SpanHelpers:Fill[short](byref,ulong,short) (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 1 single block inlinees; 1 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T01] ( 18, 38   )   byref  ->  rdi         single-def
 ;  V01 arg1         [V01,T06] ( 10,  6.50)    long  ->  rsi         single-def
 ;  V02 arg2         [V02,T02] ( 18, 38   )   short  ->  rdx         single-def
 ;  V03 loc0         [V03,T00] ( 23, 50   )    long  ->  rax        
 ;* V04 loc1         [V04    ] (  0,  0   )   short  ->  zero-ref    ld-addr-op
 ;  V05 loc2         [V05,T10] (  5,  9.50)  simd32  ->  mm0         ld-addr-op <System.Numerics.Vector`1[ubyte]>
 ;  V06 loc3         [V06,T05] (  5,  9.50)   byref  ->  rdi         single-def
 ;  V07 loc4         [V07,T09] (  4,  2   )    long  ->  rax        
 ;  V08 loc5         [V08,T07] (  2,  4.50)    long  ->  rcx        
 ;  V09 loc6         [V09,T03] (  7, 21   )    long  ->  rdx        
 ;* V10 loc7         [V10    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;  V11 loc8         [V11,T08] (  2,  4.50)    long  ->  rcx        
 ;# V12 OutArgs      [V12    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;  V13 tmp1         [V13,T04] (  2, 16   )    long  ->  rax         "dup spill"
 ;* V14 tmp2         [V14    ] (  0,  0   )  simd32  ->  zero-ref    ld-addr-op "NewObj constructor temp" <System.Numerics.Vector`1[ushort]>
 ;* V15 tmp3         [V15    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;* V16 tmp4         [V16    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
 ;
 ; Lcl frame size = 0
 
 G_M24463_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M24463_IG02:
        cmp      rsi, 16
        jae      G_M24463_IG09
 						;; size=10 bbWeight=1 PerfScore 1.25
 G_M24463_IG03:
        xor      eax, eax
        cmp      rsi, 8
        jb       SHORT G_M24463_IG05
        mov      rcx, rsi
        and      rcx, -8
        align    [3 bytes for IG04]
 						;; size=18 bbWeight=0.50 PerfScore 1.12
 G_M24463_IG04:
        mov      word  ptr [rdi+2*rax], dx
        mov      word  ptr [rdi+2*rax+0x02], dx
        mov      word  ptr [rdi+2*rax+0x04], dx
        mov      word  ptr [rdi+2*rax+0x06], dx
        mov      word  ptr [rdi+2*rax+0x08], dx
        mov      word  ptr [rdi+2*rax+0x0A], dx
        mov      word  ptr [rdi+2*rax+0x0C], dx
        mov      word  ptr [rdi+2*rax+0x0E], dx
        add      rax, 8
        cmp      rax, rcx
        jb       SHORT G_M24463_IG04
 						;; size=48 bbWeight=4 PerfScore 38.00
 G_M24463_IG05:
        test     sil, 4
        je       SHORT G_M24463_IG06
        mov      word  ptr [rdi+2*rax], dx
        mov      word  ptr [rdi+2*rax+0x02], dx
        mov      word  ptr [rdi+2*rax+0x04], dx
        mov      word  ptr [rdi+2*rax+0x06], dx
        add      rax, 4
 						;; size=29 bbWeight=0.50 PerfScore 2.75
 G_M24463_IG06:
        test     sil, 2
        je       SHORT G_M24463_IG07
        mov      word  ptr [rdi+2*rax], dx
        mov      word  ptr [rdi+2*rax+0x02], dx
        add      rax, 2
 						;; size=19 bbWeight=0.50 PerfScore 1.75
 G_M24463_IG07:
        test     sil, 1
        je       SHORT G_M24463_IG08
        mov      word  ptr [rdi+2*rax], dx
 						;; size=10 bbWeight=0.50 PerfScore 1.12
 G_M24463_IG08:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M24463_IG09:
-       movzx    rax, dx
-       vmovd    xmm0, eax
+       vmovd    xmm0, edx
        vpbroadcastw ymm0, ymm0
        lea      rax, [rsi+rsi]
        mov      rcx, rax
        and      rcx, -64
        xor      edx, edx
        cmp      rsi, 32
        jb       SHORT G_M24463_IG11
-       align    [2 bytes for IG10]
-						;; size=33 bbWeight=0.50 PerfScore 3.50
+       align    [0 bytes for IG10]
+						;; size=28 bbWeight=0.50 PerfScore 3.25
 G_M24463_IG10:
        vmovups  ymmword ptr [rdi+rdx], ymm0
        vmovups  ymmword ptr [rdi+rdx+0x20], ymm0
        add      rdx, 64
        cmp      rdx, rcx
        jb       SHORT G_M24463_IG10
 						;; size=20 bbWeight=4 PerfScore 22.00
 G_M24463_IG11:
        test     al, 32
        je       SHORT G_M24463_IG12
        vmovups  ymmword ptr [rdi+rdx], ymm0
 						;; size=9 bbWeight=0.50 PerfScore 1.62
 G_M24463_IG12:
        vmovups  ymmword ptr [rdi+rax-0x20], ymm0
 						;; size=6 bbWeight=0.50 PerfScore 1.00
 G_M24463_IG13:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 
-; Total bytes of code 216, prolog size 4, PerfScore 77.88, instruction count 61, allocated bytes for code 216 (MethodHash=7507a070) for method System.SpanHelpers:Fill[short](byref,ulong,short) (FullOpts)
+; Total bytes of code 211, prolog size 4, PerfScore 77.62, instruction count 60, allocated bytes for code 211 (MethodHash=7507a070) for method System.SpanHelpers:Fill[short](byref,ulong,short) (FullOpts)
 ; ============================================================
-5 (-2.31 % of base) - System.SpanHelpers:Fill[ushort](byref,ulong,ushort)
 ; Assembly listing for method System.SpanHelpers:Fill[ushort](byref,ulong,ushort) (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 1 single block inlinees; 1 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T01] ( 18, 38   )   byref  ->  rdi         single-def
 ;  V01 arg1         [V01,T06] ( 10,  6.50)    long  ->  rsi         single-def
 ;  V02 arg2         [V02,T02] ( 18, 38   )  ushort  ->  rdx         single-def
 ;  V03 loc0         [V03,T00] ( 23, 50   )    long  ->  rax        
 ;* V04 loc1         [V04    ] (  0,  0   )  ushort  ->  zero-ref    ld-addr-op
 ;  V05 loc2         [V05,T10] (  5,  9.50)  simd32  ->  mm0         ld-addr-op <System.Numerics.Vector`1[ubyte]>
 ;  V06 loc3         [V06,T05] (  5,  9.50)   byref  ->  rdi         single-def
 ;  V07 loc4         [V07,T09] (  4,  2   )    long  ->  rax        
 ;  V08 loc5         [V08,T07] (  2,  4.50)    long  ->  rcx        
 ;  V09 loc6         [V09,T03] (  7, 21   )    long  ->  rdx        
 ;* V10 loc7         [V10    ] (  0,  0   )  simd16  ->  zero-ref    <System.Runtime.Intrinsics.Vector128`1[ubyte]>
 ;  V11 loc8         [V11,T08] (  2,  4.50)    long  ->  rcx        
 ;# V12 OutArgs      [V12    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;  V13 tmp1         [V13,T04] (  2, 16   )    long  ->  rax         "dup spill"
 ;* V14 tmp2         [V14    ] (  0,  0   )  simd32  ->  zero-ref    ld-addr-op "NewObj constructor temp" <System.Numerics.Vector`1[ushort]>
 ;* V15 tmp3         [V15    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;* V16 tmp4         [V16    ] (  0,  0   )  ushort  ->  zero-ref    "Inlining Arg"
 ;
 ; Lcl frame size = 0
 
 G_M51983_IG01:
        push     rbp
        mov      rbp, rsp
 						;; size=4 bbWeight=1 PerfScore 1.25
 G_M51983_IG02:
        cmp      rsi, 16
        jae      G_M51983_IG09
 						;; size=10 bbWeight=1 PerfScore 1.25
 G_M51983_IG03:
        xor      eax, eax
        cmp      rsi, 8
        jb       SHORT G_M51983_IG05
        mov      rcx, rsi
        and      rcx, -8
        align    [3 bytes for IG04]
 						;; size=18 bbWeight=0.50 PerfScore 1.12
 G_M51983_IG04:
        mov      word  ptr [rdi+2*rax], dx
        mov      word  ptr [rdi+2*rax+0x02], dx
        mov      word  ptr [rdi+2*rax+0x04], dx
        mov      word  ptr [rdi+2*rax+0x06], dx
        mov      word  ptr [rdi+2*rax+0x08], dx
        mov      word  ptr [rdi+2*rax+0x0A], dx
        mov      word  ptr [rdi+2*rax+0x0C], dx
        mov      word  ptr [rdi+2*rax+0x0E], dx
        add      rax, 8
        cmp      rax, rcx
        jb       SHORT G_M51983_IG04
 						;; size=48 bbWeight=4 PerfScore 38.00
 G_M51983_IG05:
        test     sil, 4
        je       SHORT G_M51983_IG06
        mov      word  ptr [rdi+2*rax], dx
        mov      word  ptr [rdi+2*rax+0x02], dx
        mov      word  ptr [rdi+2*rax+0x04], dx
        mov      word  ptr [rdi+2*rax+0x06], dx
        add      rax, 4
 						;; size=29 bbWeight=0.50 PerfScore 2.75
 G_M51983_IG06:
        test     sil, 2
        je       SHORT G_M51983_IG07
        mov      word  ptr [rdi+2*rax], dx
        mov      word  ptr [rdi+2*rax+0x02], dx
        add      rax, 2
 						;; size=19 bbWeight=0.50 PerfScore 1.75
 G_M51983_IG07:
        test     sil, 1
        je       SHORT G_M51983_IG08
        mov      word  ptr [rdi+2*rax], dx
 						;; size=10 bbWeight=0.50 PerfScore 1.12
 G_M51983_IG08:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 G_M51983_IG09:
-       movzx    rax, dx
-       vmovd    xmm0, eax
+       vmovd    xmm0, edx
        vpbroadcastw ymm0, ymm0
        lea      rax, [rsi+rsi]
        mov      rcx, rax
        and      rcx, -64
        xor      edx, edx
        cmp      rsi, 32
        jb       SHORT G_M51983_IG11
-       align    [2 bytes for IG10]
-						;; size=33 bbWeight=0.50 PerfScore 3.50
+       align    [0 bytes for IG10]
+						;; size=28 bbWeight=0.50 PerfScore 3.25
 G_M51983_IG10:
        vmovups  ymmword ptr [rdi+rdx], ymm0
        vmovups  ymmword ptr [rdi+rdx+0x20], ymm0
        add      rdx, 64
        cmp      rdx, rcx
        jb       SHORT G_M51983_IG10
 						;; size=20 bbWeight=4 PerfScore 22.00
 G_M51983_IG11:
        test     al, 32
        je       SHORT G_M51983_IG12
        vmovups  ymmword ptr [rdi+rdx], ymm0
 						;; size=9 bbWeight=0.50 PerfScore 1.62
 G_M51983_IG12:
        vmovups  ymmword ptr [rdi+rax-0x20], ymm0
 						;; size=6 bbWeight=0.50 PerfScore 1.00
 G_M51983_IG13:
        vzeroupper 
        pop      rbp
        ret      
 						;; size=5 bbWeight=0.50 PerfScore 1.25
 
-; Total bytes of code 216, prolog size 4, PerfScore 77.88, instruction count 61, allocated bytes for code 216 (MethodHash=267434f0) for method System.SpanHelpers:Fill[ushort](byref,ulong,ushort) (FullOpts)
+; Total bytes of code 211, prolog size 4, PerfScore 77.62, instruction count 60, allocated bytes for code 211 (MethodHash=267434f0) for method System.SpanHelpers:Fill[ushort](byref,ulong,ushort) (FullOpts)
 ; ============================================================
-4 (-13.33 % of base) - System.Numerics.Vector:Multiply[short](short,System.Numerics.Vector`1[short]):System.Numerics.Vector`1[short]
 ; Assembly listing for method System.Numerics.Vector:Multiply[short](short,System.Numerics.Vector`1[short]):System.Numerics.Vector`1[short] (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rsp based frame
 ; partially interruptible
 ; No PGO data
 ; Final local variable assignments
 ;
 ;  V00 RetBuf       [V00,T00] (  4,  4   )   byref  ->  rdi         single-def
 ;  V01 arg0         [V01,T01] (  3,  3   )   short  ->  rsi         single-def
 ;  V02 arg1         [V02,T02] (  1,  1   )  simd32  ->  [rsp+0x08]  single-def <System.Numerics.Vector`1[short]>
 ;# V03 OutArgs      [V03    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;
 ; Lcl frame size = 0
 
 G_M33721_IG01:
 						;; size=0 bbWeight=1 PerfScore 0.00
 G_M33721_IG02:
-       movsx    rax, si
-       vmovd    xmm0, eax
+       vmovd    xmm0, esi
        vpbroadcastw ymm0, ymm0
        vpmullw  ymm0, ymm0, ymmword ptr [rsp+0x08]
        vmovups  ymmword ptr [rdi], ymm0
        mov      rax, rdi
-						;; size=26 bbWeight=1 PerfScore 12.50
+						;; size=22 bbWeight=1 PerfScore 12.25
 G_M33721_IG03:
        vzeroupper 
        ret      
 						;; size=4 bbWeight=1 PerfScore 2.00
 
-; Total bytes of code 30, prolog size 0, PerfScore 14.50, instruction count 8, allocated bytes for code 30 (MethodHash=97457c46) for method System.Numerics.Vector:Multiply[short](short,System.Numerics.Vector`1[short]):System.Numerics.Vector`1[short] (FullOpts)
+; Total bytes of code 26, prolog size 0, PerfScore 14.25, instruction count 7, allocated bytes for code 26 (MethodHash=97457c46) for method System.Numerics.Vector:Multiply[short](short,System.Numerics.Vector`1[short]):System.Numerics.Vector`1[short] (FullOpts)
 ; ============================================================
-4 (-13.33 % of base) - System.Numerics.Vector:Multiply[short](System.Numerics.Vector`1[short],short):System.Numerics.Vector`1[short]
 ; Assembly listing for method System.Numerics.Vector:Multiply[short](System.Numerics.Vector`1[short],short):System.Numerics.Vector`1[short] (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rsp based frame
 ; partially interruptible
 ; No PGO data
 ; Final local variable assignments
 ;
 ;  V00 RetBuf       [V00,T00] (  4,  4   )   byref  ->  rdi         single-def
 ;  V01 arg0         [V01,T02] (  1,  1   )  simd32  ->  [rsp+0x08]  single-def <System.Numerics.Vector`1[short]>
 ;  V02 arg1         [V02,T01] (  3,  3   )   short  ->  rsi         single-def
 ;# V03 OutArgs      [V03    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;
 ; Lcl frame size = 0
 
 G_M16569_IG01:
 						;; size=0 bbWeight=1 PerfScore 0.00
 G_M16569_IG02:
-       movsx    rax, si
-       vmovd    xmm0, eax
+       vmovd    xmm0, esi
        vpbroadcastw ymm0, ymm0
        vpmullw  ymm0, ymm0, ymmword ptr [rsp+0x08]
        vmovups  ymmword ptr [rdi], ymm0
        mov      rax, rdi
-						;; size=26 bbWeight=1 PerfScore 12.50
+						;; size=22 bbWeight=1 PerfScore 12.25
 G_M16569_IG03:
        vzeroupper 
        ret      
 						;; size=4 bbWeight=1 PerfScore 2.00
 
-; Total bytes of code 30, prolog size 0, PerfScore 14.50, instruction count 8, allocated bytes for code 30 (MethodHash=2ef4bf46) for method System.Numerics.Vector:Multiply[short](System.Numerics.Vector`1[short],short):System.Numerics.Vector`1[short] (FullOpts)
+; Total bytes of code 26, prolog size 0, PerfScore 14.25, instruction count 7, allocated bytes for code 26 (MethodHash=2ef4bf46) for method System.Numerics.Vector:Multiply[short](System.Numerics.Vector`1[short],short):System.Numerics.Vector`1[short] (FullOpts)
 ; ============================================================
-4 (-4.08 % of base) - System.Numerics.Vector:Multiply[ubyte](System.Numerics.Vector`1[ubyte],ubyte):System.Numerics.Vector`1[ubyte]
 ; Assembly listing for method System.Numerics.Vector:Multiply[ubyte](System.Numerics.Vector`1[ubyte],ubyte):System.Numerics.Vector`1[ubyte] (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 4 single block inlinees; 2 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 RetBuf       [V00,T02] (  4,  4   )   byref  ->  rdi         single-def
 ;  V01 arg0         [V01,T04] (  1,  0.25)  simd32  ->  [rbp+0x10]  single-def <System.Numerics.Vector`1[ubyte]>
 ;  V02 arg1         [V02,T03] (  3,  2.25)   ubyte  ->  rsi         single-def
 ;# V03 OutArgs      [V03    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;  V04 tmp1         [V04    ] (  2,  5   )  simd32  ->  [rbp-0x30]  do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <System.Numerics.Vector`1[ubyte]>
 ;  V05 tmp2         [V05,T00] (  5, 16.25)     int  ->  rax         "Inline stloc first use temp"
 ;  V06 tmp3         [V06    ] (  2,  8.50)  simd32  ->  [rbp-0x50]  do-not-enreg[XS] addr-exposed ld-addr-op "Inlining Arg" <System.Numerics.Vector`1[ubyte]>
 ;  V07 tmp4         [V07    ] (  2,  8.50)  simd32  ->  [rbp-0x70]  do-not-enreg[XS] addr-exposed ld-addr-op "Inlining Arg" <System.Numerics.Vector`1[ubyte]>
 ;* V08 tmp5         [V08    ] (  0,  0   )     int  ->  zero-ref    "impAppendStmt"
 ;* V09 tmp6         [V09    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline stloc first use temp"
 ;* V10 tmp7         [V10    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;* V11 tmp8         [V11    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
 ;* V12 tmp9         [V12    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
 ;  V13 cse0         [V13,T01] (  4, 16   )    long  ->  rdx         "CSE #01: aggressive"
 ;
 ; Lcl frame size = 112
 
 G_M18297_IG01:
        push     rbp
        sub      rsp, 112
        lea      rbp, [rsp+0x70]
 						;; size=10 bbWeight=0.25 PerfScore 0.44
 G_M18297_IG02:
        vmovups  ymm0, ymmword ptr [rbp+0x10]
        vmovups  ymmword ptr [rbp-0x50], ymm0
-       movzx    rax, sil
-       vmovd    xmm0, eax
+       vmovd    xmm0, esi
        vpbroadcastb ymm0, ymm0
        vmovups  ymmword ptr [rbp-0x70], ymm0
        xor      eax, eax
        align    [0 bytes for IG03]
-						;; size=30 bbWeight=0.25 PerfScore 2.62
+						;; size=26 bbWeight=0.25 PerfScore 2.56
 G_M18297_IG03:
        lea      rcx, [rbp-0x50]
        movsxd   rdx, eax
        movzx    rcx, byte  ptr [rcx+rdx]
        lea      rsi, [rbp-0x70]
        movzx    rsi, byte  ptr [rsi+rdx]
        imul     ecx, esi
        lea      rsi, [rbp-0x30]
        mov      byte  ptr [rsi+rdx], cl
        inc      eax
        cmp      eax, 32
        jl       SHORT G_M18297_IG03
 						;; size=37 bbWeight=4 PerfScore 41.00
 G_M18297_IG04:
        vmovups  ymm0, ymmword ptr [rbp-0x30]
        vmovups  ymmword ptr [rdi], ymm0
        mov      rax, rdi
 						;; size=12 bbWeight=1 PerfScore 6.25
 G_M18297_IG05:
        vzeroupper 
        add      rsp, 112
        pop      rbp
        ret      
 						;; size=9 bbWeight=1 PerfScore 2.75
 
-; Total bytes of code 98, prolog size 10, PerfScore 53.06, instruction count 29, allocated bytes for code 98 (MethodHash=9485b886) for method System.Numerics.Vector:Multiply[ubyte](System.Numerics.Vector`1[ubyte],ubyte):System.Numerics.Vector`1[ubyte] (FullOpts)
+; Total bytes of code 94, prolog size 10, PerfScore 53.00, instruction count 28, allocated bytes for code 94 (MethodHash=9485b886) for method System.Numerics.Vector:Multiply[ubyte](System.Numerics.Vector`1[ubyte],ubyte):System.Numerics.Vector`1[ubyte] (FullOpts)
 ; ============================================================
-4 (-4.08 % of base) - System.Numerics.Vector:Multiply[ubyte](ubyte,System.Numerics.Vector`1[ubyte]):System.Numerics.Vector`1[ubyte]
 ; Assembly listing for method System.Numerics.Vector:Multiply[ubyte](ubyte,System.Numerics.Vector`1[ubyte]):System.Numerics.Vector`1[ubyte] (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 4 single block inlinees; 2 inlinees without PGO data
 ; Final local variable assignments
 ;
 ;  V00 RetBuf       [V00,T02] (  4,  4   )   byref  ->  rdi         single-def
 ;  V01 arg0         [V01,T03] (  3,  2.25)   ubyte  ->  rsi         single-def
 ;  V02 arg1         [V02,T04] (  1,  0.25)  simd32  ->  [rbp+0x10]  single-def <System.Numerics.Vector`1[ubyte]>
 ;# V03 OutArgs      [V03    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;  V04 tmp1         [V04    ] (  2,  5   )  simd32  ->  [rbp-0x30]  do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <System.Numerics.Vector`1[ubyte]>
 ;  V05 tmp2         [V05,T00] (  5, 16.25)     int  ->  rax         "Inline stloc first use temp"
 ;  V06 tmp3         [V06    ] (  2,  8.50)  simd32  ->  [rbp-0x50]  do-not-enreg[XS] addr-exposed ld-addr-op "Inlining Arg" <System.Numerics.Vector`1[ubyte]>
 ;  V07 tmp4         [V07    ] (  2,  8.50)  simd32  ->  [rbp-0x70]  do-not-enreg[XS] addr-exposed ld-addr-op "Inlining Arg" <System.Numerics.Vector`1[ubyte]>
 ;* V08 tmp5         [V08    ] (  0,  0   )     int  ->  zero-ref    "impAppendStmt"
 ;* V09 tmp6         [V09    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline stloc first use temp"
 ;* V10 tmp7         [V10    ] (  0,  0   )   ubyte  ->  zero-ref    "Inline return value spill temp"
 ;* V11 tmp8         [V11    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
 ;* V12 tmp9         [V12    ] (  0,  0   )   ubyte  ->  zero-ref    "Inlining Arg"
 ;  V13 cse0         [V13,T01] (  4, 16   )    long  ->  rdx         "CSE #01: aggressive"
 ;
 ; Lcl frame size = 112
 
 G_M24697_IG01:
        push     rbp
        sub      rsp, 112
        lea      rbp, [rsp+0x70]
 						;; size=10 bbWeight=0.25 PerfScore 0.44
 G_M24697_IG02:
        vmovups  ymm0, ymmword ptr [rbp+0x10]
        vmovups  ymmword ptr [rbp-0x50], ymm0
-       movzx    rax, sil
-       vmovd    xmm0, eax
+       vmovd    xmm0, esi
        vpbroadcastb ymm0, ymm0
        vmovups  ymmword ptr [rbp-0x70], ymm0
        xor      eax, eax
        align    [0 bytes for IG03]
-						;; size=30 bbWeight=0.25 PerfScore 2.62
+						;; size=26 bbWeight=0.25 PerfScore 2.56
 G_M24697_IG03:
        lea      rcx, [rbp-0x50]
        movsxd   rdx, eax
        movzx    rcx, byte  ptr [rcx+rdx]
        lea      rsi, [rbp-0x70]
        movzx    rsi, byte  ptr [rsi+rdx]
        imul     ecx, esi
        lea      rsi, [rbp-0x30]
        mov      byte  ptr [rsi+rdx], cl
        inc      eax
        cmp      eax, 32
        jl       SHORT G_M24697_IG03
 						;; size=37 bbWeight=4 PerfScore 41.00
 G_M24697_IG04:
        vmovups  ymm0, ymmword ptr [rbp-0x30]
        vmovups  ymmword ptr [rdi], ymm0
        mov      rax, rdi
 						;; size=12 bbWeight=1 PerfScore 6.25
 G_M24697_IG05:
        vzeroupper 
        add      rsp, 112
        pop      rbp
        ret      
 						;; size=9 bbWeight=1 PerfScore 2.75
 
-; Total bytes of code 98, prolog size 10, PerfScore 53.06, instruction count 29, allocated bytes for code 98 (MethodHash=1e1e9f86) for method System.Numerics.Vector:Multiply[ubyte](ubyte,System.Numerics.Vector`1[ubyte]):System.Numerics.Vector`1[ubyte] (FullOpts)
+; Total bytes of code 94, prolog size 10, PerfScore 53.00, instruction count 28, allocated bytes for code 94 (MethodHash=1e1e9f86) for method System.Numerics.Vector:Multiply[ubyte](ubyte,System.Numerics.Vector`1[ubyte]):System.Numerics.Vector`1[ubyte] (FullOpts)
 ; ============================================================
-4 (-19.05 % of base) - System.Numerics.Vector`1[short]:.ctor(short):this
 ; Assembly listing for method System.Numerics.Vector`1[short]:.ctor(short):this (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rsp based frame
 ; partially interruptible
 ; No PGO data
 ; Final local variable assignments
 ;
 ;  V00 this         [V00,T00] (  3,  3   )   byref  ->  rdi         this single-def
 ;  V01 arg1         [V01,T01] (  3,  3   )   short  ->  rsi         single-def
 ;# V02 OutArgs      [V02    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;
 ; Lcl frame size = 0
 
 G_M25674_IG01:
 						;; size=0 bbWeight=1 PerfScore 0.00
 G_M25674_IG02:
-       movsx    rax, si
-       vmovd    xmm0, eax
+       vmovd    xmm0, esi
        vpbroadcastw ymm0, ymm0
        vmovups  ymmword ptr [rdi], ymm0
-						;; size=17 bbWeight=1 PerfScore 6.25
+						;; size=13 bbWeight=1 PerfScore 6.00
 G_M25674_IG03:
        vzeroupper 
        ret      
 						;; size=4 bbWeight=1 PerfScore 2.00
 
-; Total bytes of code 21, prolog size 0, PerfScore 8.25, instruction count 6, allocated bytes for code 21 (MethodHash=d18e9bb5) for method System.Numerics.Vector`1[short]:.ctor(short):this (FullOpts)
+; Total bytes of code 17, prolog size 0, PerfScore 8.00, instruction count 5, allocated bytes for code 17 (MethodHash=d18e9bb5) for method System.Numerics.Vector`1[short]:.ctor(short):this (FullOpts)
 ; ============================================================
-4 (-13.33 % of base) - System.Numerics.Vector`1[short]:op_Multiply(short,System.Numerics.Vector`1[short]):System.Numerics.Vector`1[short]
 ; Assembly listing for method System.Numerics.Vector`1[short]:op_Multiply(short,System.Numerics.Vector`1[short]):System.Numerics.Vector`1[short] (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rsp based frame
 ; partially interruptible
 ; No PGO data
 ; Final local variable assignments
 ;
 ;  V00 RetBuf       [V00,T00] (  4,  4   )   byref  ->  rdi         single-def
 ;  V01 arg0         [V01,T01] (  3,  3   )   short  ->  rsi         single-def
 ;  V02 arg1         [V02,T02] (  1,  1   )  simd32  ->  [rsp+0x08]  single-def <System.Numerics.Vector`1[short]>
 ;# V03 OutArgs      [V03    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;
 ; Lcl frame size = 0
 
 G_M16008_IG01:
 						;; size=0 bbWeight=1 PerfScore 0.00
 G_M16008_IG02:
-       movsx    rax, si
-       vmovd    xmm0, eax
+       vmovd    xmm0, esi
        vpbroadcastw ymm0, ymm0
        vpmullw  ymm0, ymm0, ymmword ptr [rsp+0x08]
        vmovups  ymmword ptr [rdi], ymm0
        mov      rax, rdi
-						;; size=26 bbWeight=1 PerfScore 12.50
+						;; size=22 bbWeight=1 PerfScore 12.25
 G_M16008_IG03:
        vzeroupper 
        ret      
 						;; size=4 bbWeight=1 PerfScore 2.00
 
-; Total bytes of code 30, prolog size 0, PerfScore 14.50, instruction count 8, allocated bytes for code 30 (MethodHash=8623c177) for method System.Numerics.Vector`1[short]:op_Multiply(short,System.Numerics.Vector`1[short]):System.Numerics.Vector`1[short] (FullOpts)
+; Total bytes of code 26, prolog size 0, PerfScore 14.25, instruction count 7, allocated bytes for code 26 (MethodHash=8623c177) for method System.Numerics.Vector`1[short]:op_Multiply(short,System.Numerics.Vector`1[short]):System.Numerics.Vector`1[short] (FullOpts)
 ; ============================================================
-4 (-13.33 % of base) - System.Numerics.Vector`1[short]:op_Multiply(System.Numerics.Vector`1[short],short):System.Numerics.Vector`1[short]
 ; Assembly listing for method System.Numerics.Vector`1[short]:op_Multiply(System.Numerics.Vector`1[short],short):System.Numerics.Vector`1[short] (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rsp based frame
 ; partially interruptible
 ; No PGO data
 ; Final local variable assignments
 ;
 ;  V00 RetBuf       [V00,T00] (  4,  4   )   byref  ->  rdi         single-def
 ;  V01 arg0         [V01,T02] (  1,  1   )  simd32  ->  [rsp+0x08]  single-def <System.Numerics.Vector`1[short]>
 ;  V02 arg1         [V02,T01] (  3,  3   )   short  ->  rsi         single-def
 ;# V03 OutArgs      [V03    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;
 ; Lcl frame size = 0
 
 G_M61576_IG01:
 						;; size=0 bbWeight=1 PerfScore 0.00
 G_M61576_IG02:
-       movsx    rax, si
-       vmovd    xmm0, eax
+       vmovd    xmm0, esi
        vpbroadcastw ymm0, ymm0
        vpmullw  ymm0, ymm0, ymmword ptr [rsp+0x08]
        vmovups  ymmword ptr [rdi], ymm0
        mov      rax, rdi
-						;; size=26 bbWeight=1 PerfScore 12.50
+						;; size=22 bbWeight=1 PerfScore 12.25
 G_M61576_IG03:
        vzeroupper 
        ret      
 						;; size=4 bbWeight=1 PerfScore 2.00
 
-; Total bytes of code 30, prolog size 0, PerfScore 14.50, instruction count 8, allocated bytes for code 30 (MethodHash=45690f77) for method System.Numerics.Vector`1[short]:op_Multiply(System.Numerics.Vector`1[short],short):System.Numerics.Vector`1[short] (FullOpts)
+; Total bytes of code 26, prolog size 0, PerfScore 14.25, instruction count 7, allocated bytes for code 26 (MethodHash=45690f77) for method System.Numerics.Vector`1[short]:op_Multiply(System.Numerics.Vector`1[short],short):System.Numerics.Vector`1[short] (FullOpts)
 ; ============================================================
-4 (-11.76 % of base) - System.Numerics.Vector`1[short]:System.Runtime.Intrinsics.ISimdVector<System.Numerics.Vector<T>,T>.Any(System.Numerics.Vector`1[short],short):ubyte
 ; Assembly listing for method System.Numerics.Vector`1[short]:System.Runtime.Intrinsics.ISimdVector<System.Numerics.Vector<T>,T>.Any(System.Numerics.Vector`1[short],short):ubyte (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rsp based frame
 ; partially interruptible
 ; No PGO data
 ; Final local variable assignments
 ;
 ;  V00 arg0         [V00,T02] (  1,  1   )  simd32  ->  [rsp+0x08]  single-def <System.Numerics.Vector`1[short]>
 ;  V01 arg1         [V01,T00] (  3,  3   )   short  ->  rdi         single-def
 ;# V02 OutArgs      [V02    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;  V03 rat0         [V03,T01] (  3,  6   )  simd32  ->  mm0         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
 G_M23858_IG01:
 						;; size=0 bbWeight=1 PerfScore 0.00
 G_M23858_IG02:
-       movsx    rax, di
-       vmovd    xmm0, eax
+       vmovd    xmm0, edi
        vpbroadcastw ymm0, ymm0
        vpcmpeqw ymm0, ymm0, ymmword ptr [rsp+0x08]
        vptest   ymm0, ymm0
        setne    al
        movzx    rax, al
-						;; size=30 bbWeight=1 PerfScore 12.50
+						;; size=26 bbWeight=1 PerfScore 12.25
 G_M23858_IG03:
        vzeroupper 
        ret      
 						;; size=4 bbWeight=1 PerfScore 2.00
 
-; Total bytes of code 34, prolog size 0, PerfScore 14.50, instruction count 9, allocated bytes for code 34 (MethodHash=a9bca2cd) for method System.Numerics.Vector`1[short]:System.Runtime.Intrinsics.ISimdVector<System.Numerics.Vector<T>,T>.Any(System.Numerics.Vector`1[short],short):ubyte (FullOpts)
+; Total bytes of code 30, prolog size 0, PerfScore 14.25, instruction count 8, allocated bytes for code 30 (MethodHash=a9bca2cd) for method System.Numerics.Vector`1[short]:System.Runtime.Intrinsics.ISimdVector<System.Numerics.Vector<T>,T>.Any(System.Numerics.Vector`1[short],short):ubyte (FullOpts)
 ; ============================================================
-4 (-16.67 % of base) - System.Numerics.Vector`1[short]:System.Runtime.Intrinsics.ISimdVector<System.Numerics.Vector<T>,T>.Create(short):System.Numerics.Vector`1[short]
 ; Assembly listing for method System.Numerics.Vector`1[short]:System.Runtime.Intrinsics.ISimdVector<System.Numerics.Vector<T>,T>.Create(short):System.Numerics.Vector`1[short] (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rsp based frame
 ; partially interruptible
 ; No PGO data
 ; Final local variable assignments
 ;
 ;  V00 RetBuf       [V00,T00] (  4,  4   )   byref  ->  rdi         single-def
 ;  V01 arg0         [V01,T01] (  3,  3   )   short  ->  rsi         single-def
 ;# V02 OutArgs      [V02    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;
 ; Lcl frame size = 0
 
 G_M23411_IG01:
 						;; size=0 bbWeight=1 PerfScore 0.00
 G_M23411_IG02:
-       movsx    rax, si
-       vmovd    xmm0, eax
+       vmovd    xmm0, esi
        vpbroadcastw ymm0, ymm0
        vmovups  ymmword ptr [rdi], ymm0
        mov      rax, rdi
-						;; size=20 bbWeight=1 PerfScore 6.50
+						;; size=16 bbWeight=1 PerfScore 6.25
 G_M23411_IG03:
        vzeroupper 
        ret      
 						;; size=4 bbWeight=1 PerfScore 2.00
 
-; Total bytes of code 24, prolog size 0, PerfScore 8.50, instruction count 7, allocated bytes for code 24 (MethodHash=99b6a48c) for method System.Numerics.Vector`1[short]:System.Runtime.Intrinsics.ISimdVector<System.Numerics.Vector<T>,T>.Create(short):System.Numerics.Vector`1[short] (FullOpts)
+; Total bytes of code 20, prolog size 0, PerfScore 8.25, instruction count 6, allocated bytes for code 20 (MethodHash=99b6a48c) for method System.Numerics.Vector`1[short]:System.Runtime.Intrinsics.ISimdVector<System.Numerics.Vector<T>,T>.Create(short):System.Numerics.Vector`1[short] (FullOpts)
 ; ============================================================
-4 (-13.33 % of base) - System.Numerics.Vector`1[short]:System.Runtime.Intrinsics.ISimdVector<System.Numerics.Vector<T>,T>.Multiply(System.Numerics.Vector`1[short],short):System.Numerics.Vector`1[short]
 ; Assembly listing for method System.Numerics.Vector`1[short]:System.Runtime.Intrinsics.ISimdVector<System.Numerics.Vector<T>,T>.Multiply(System.Numerics.Vector`1[short],short):System.Numerics.Vector`1[short] (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Unix
 ; FullOpts code
 ; optimized code
 ; rsp based frame
 ; partially interruptible
 ; No PGO data
 ; Final local variable assignments
 ;
 ;  V00 RetBuf       [V00,T00] (  4,  4   )   byref  ->  rdi         single-def
 ;  V01 arg0         [V01,T02] (  1,  1   )  simd32  ->  [rsp+0x08]  single-def <System.Numerics.Vector`1[short]>
 ;  V02 arg1         [V02,T01] (  3,  3   )   short  ->  rsi         single-def
 ;# V03 OutArgs      [V03    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
 ;
 ; Lcl frame size = 0
 
 G_M5366_IG01:
 						;; size=0 bbWeight=1 PerfScore 0.00
 G_M5366_IG02:
-       movsx    rax, si
-       vmovd    xmm0, eax
+       vmovd    xmm0, esi
        vpbroadcastw ymm0, ymm0
        vpmullw  ymm0, ymm0, ymmword ptr [rsp+0x08]
        vmovups  ymmword ptr [rdi], ymm0
        mov      rax, rdi
-						;; size=26 bbWeight=1 PerfScore 12.50
+						;; size=22 bbWeight=1 PerfScore 12.25
 G_M5366_IG03:
        vzeroupper 
        ret      
 						;; size=4 bbWeight=1 PerfScore 2.00
 
-; Total bytes of code 30, prolog size 0, PerfScore 14.50, instruction count 8, allocated bytes for code 30 (MethodHash=9611eb09) for method System.Numerics.Vector`1[short]:System.Runtime.Intrinsics.ISimdVector<System.Numerics.Vector<T>,T>.Multiply(System.Numerics.Vector`1[short],short):System.Numerics.Vector`1[short] (FullOpts)
+; Total bytes of code 26, prolog size 0, PerfScore 14.25, instruction count 7, allocated bytes for code 26 (MethodHash=9611eb09) for method System.Numerics.Vector`1[short]:System.Runtime.Intrinsics.ISimdVector<System.Numerics.Vector<T>,T>.Multiply(System.Numerics.Vector`1[short],short):System.Numerics.Vector`1[short] (FullOpts)
 ; ============================================================
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment