/// Returns the product `lhs * rhs` of two `glam::Mat4` values.
///
/// Thin wrapper that forwards straight to the `Mul` implementation for
/// `glam::Mat4`, with `lhs` as the left-hand operand.
pub fn glam_mat4_mul(lhs: glam::Mat4, rhs: glam::Mat4) -> glam::Mat4 {
    // Explicit trait call; this is exactly what the `*` operator desugars to.
    std::ops::Mul::mul(lhs, rhs)
}
/// Returns the product `lhs * rhs` of two `nalgebra::Matrix4<f32>` values.
///
/// Thin wrapper that forwards straight to the `Mul` implementation for
/// `nalgebra::Matrix4<f32>`, with `lhs` as the left-hand operand.
pub fn nalgebra_mat4_mul(
    lhs: nalgebra::Matrix4<f32>,
    rhs: nalgebra::Matrix4<f32>,
) -> nalgebra::Matrix4<f32> {
    // Explicit trait call; this is exactly what the `*` operator desugars to.
    std::ops::Mul::mul(lhs, rhs)
}
/// Returns the product `lhs * rhs` of two `cgmath::Matrix4<f32>` values.
///
/// Thin wrapper that forwards straight to the `Mul` implementation for
/// `cgmath::Matrix4<f32>`, with `lhs` as the left-hand operand.
pub fn cgmath_mat4_mul(
    lhs: cgmath::Matrix4<f32>,
    rhs: cgmath::Matrix4<f32>,
) -> cgmath::Matrix4<f32> {
    // Explicit trait call; this is exactly what the `*` operator desugars to.
    std::ops::Mul::mul(lhs, rhs)
}
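The glam version is getting inlined, whereas the nalgebra and cgmath wrappers
each emit a `call` to a separate, non-inlined `mul` function: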
mathbench::glam_mat4_mul (src/lib.rs:190):
mov rax, rdi
movaps xmm5, xmmword, ptr, [rsi]
movaps xmm1, xmmword, ptr, [rsi, +, 16]
movaps xmm2, xmmword, ptr, [rsi, +, 32]
movaps xmm3, xmmword, ptr, [rsi, +, 48]
movaps xmm7, xmmword, ptr, [rdx]
movaps xmm6, xmmword, ptr, [rdx, +, 16]
movaps xmm9, xmmword, ptr, [rdx, +, 32]
movaps xmm8, xmmword, ptr, [rdx, +, 48]
movaps xmm0, xmm5
shufps xmm0, xmm5, 0
mulps xmm0, xmm7
movaps xmm4, xmm5
shufps xmm4, xmm5, 85
mulps xmm4, xmm6
addps xmm4, xmm0
movaps xmm0, xmm5
shufps xmm0, xmm5, 170
mulps xmm0, xmm9
addps xmm0, xmm4
shufps xmm5, xmm5, 255
mulps xmm5, xmm8
addps xmm5, xmm0
movaps xmm0, xmm1
shufps xmm0, xmm1, 0
mulps xmm0, xmm7
movaps xmm4, xmm1
shufps xmm4, xmm1, 85
mulps xmm4, xmm6
addps xmm4, xmm0
movaps xmm0, xmm1
shufps xmm0, xmm1, 170
mulps xmm0, xmm9
addps xmm0, xmm4
shufps xmm1, xmm1, 255
mulps xmm1, xmm8
addps xmm1, xmm0
movaps xmm0, xmm2
shufps xmm0, xmm2, 0
mulps xmm0, xmm7
movaps xmm4, xmm2
shufps xmm4, xmm2, 85
mulps xmm4, xmm6
addps xmm4, xmm0
movaps xmm0, xmm2
shufps xmm0, xmm2, 170
mulps xmm0, xmm9
addps xmm0, xmm4
shufps xmm2, xmm2, 255
mulps xmm2, xmm8
addps xmm2, xmm0
movaps xmm0, xmm3
shufps xmm0, xmm3, 0
mulps xmm0, xmm7
movaps xmm4, xmm3
shufps xmm4, xmm3, 85
mulps xmm4, xmm6
addps xmm4, xmm0
movaps xmm0, xmm3
shufps xmm0, xmm3, 170
mulps xmm0, xmm9
addps xmm0, xmm4
shufps xmm3, xmm3, 255
mulps xmm3, xmm8
addps xmm3, xmm0
movaps xmmword, ptr, [rdi], xmm5
movaps xmmword, ptr, [rdi, +, 16], xmm1
movaps xmmword, ptr, [rdi, +, 32], xmm2
movaps xmmword, ptr, [rdi, +, 48], xmm3
ret
mathbench::nalgebra_mat4_mul (src/lib.rs:194):
push rbx
sub rsp, 128
mov rbx, rdi
movups xmm0, xmmword, ptr, [rsi]
movups xmm1, xmmword, ptr, [rsi, +, 16]
movups xmm2, xmmword, ptr, [rsi, +, 32]
movups xmm3, xmmword, ptr, [rsi, +, 48]
movaps xmmword, ptr, [rsp, +, 48], xmm3
movaps xmmword, ptr, [rsp, +, 32], xmm2
movaps xmmword, ptr, [rsp, +, 16], xmm1
movaps xmmword, ptr, [rsp], xmm0
movups xmm0, xmmword, ptr, [rdx]
movups xmm1, xmmword, ptr, [rdx, +, 16]
movups xmm2, xmmword, ptr, [rdx, +, 32]
movups xmm3, xmmword, ptr, [rdx, +, 48]
movaps xmmword, ptr, [rsp, +, 112], xmm3
movaps xmmword, ptr, [rsp, +, 96], xmm2
movaps xmmword, ptr, [rsp, +, 80], xmm1
movaps xmmword, ptr, [rsp, +, 64], xmm0
mov rsi, rsp
lea rdx, [rsp, +, 64]
call nalgebra::base::ops::<impl core::ops::arith::Mul<nalgebra::base::matrix::Matrix<N,R2,C2,SB>> for nalgebra::base::matrix::Matrix<N,R1,C1,SA>>::mul
mov rax, rbx
add rsp, 128
pop rbx
ret
cargo asm "nalgebra::base::ops::<impl core::ops::arith::Mul<nalgebra::base::matrix::Matrix<N,R2,C2,SB>> for nalgebra::base::matrix::Matrix<N,R1,C1,SA>>::mul"
nalgebra::base::ops::<impl core::ops::arith::Mul<nalgebra::base::matrix::Matrix<N,R2,C2,SB>> for nalgebra::base::matrix::Matrix<N,R1,C1,SA>>::mul (/home/x/.cargo/registry/src/github.com-1ecc6299db9ec823/nalgebra-0.18.0/src/base/ops.rs:628):
movups xmm3, xmmword, ptr, [rsi]
movups xmm2, xmmword, ptr, [rsi, +, 16]
movups xmm1, xmmword, ptr, [rsi, +, 32]
movups xmm8, xmmword, ptr, [rsi, +, 48]
movss xmm4, dword, ptr, [rdx]
movss xmm5, dword, ptr, [rdx, +, 4]
shufps xmm4, xmm4, 0
mulps xmm4, xmm3
shufps xmm5, xmm5, 0
mulps xmm5, xmm2
addps xmm5, xmm4
movss xmm6, dword, ptr, [rdx, +, 8]
shufps xmm6, xmm6, 0
mulps xmm6, xmm1
addps xmm6, xmm5
movss xmm4, dword, ptr, [rdx, +, 12]
shufps xmm4, xmm4, 0
mulps xmm4, xmm8
addps xmm4, xmm6
movss xmm5, dword, ptr, [rdx, +, 16]
shufps xmm5, xmm5, 0
mulps xmm5, xmm3
movss xmm6, dword, ptr, [rdx, +, 20]
shufps xmm6, xmm6, 0
mulps xmm6, xmm2
addps xmm6, xmm5
movss xmm7, dword, ptr, [rdx, +, 24]
shufps xmm7, xmm7, 0
mulps xmm7, xmm1
addps xmm7, xmm6
movss xmm5, dword, ptr, [rdx, +, 28]
shufps xmm5, xmm5, 0
mulps xmm5, xmm8
addps xmm5, xmm7
movss xmm6, dword, ptr, [rdx, +, 32]
shufps xmm6, xmm6, 0
mulps xmm6, xmm3
movss xmm7, dword, ptr, [rdx, +, 36]
shufps xmm7, xmm7, 0
mulps xmm7, xmm2
addps xmm7, xmm6
movss xmm0, dword, ptr, [rdx, +, 40]
shufps xmm0, xmm0, 0
mulps xmm0, xmm1
addps xmm0, xmm7
movss xmm6, dword, ptr, [rdx, +, 44]
shufps xmm6, xmm6, 0
mulps xmm6, xmm8
addps xmm6, xmm0
movss xmm0, dword, ptr, [rdx, +, 48]
shufps xmm0, xmm0, 0
mulps xmm0, xmm3
movss xmm3, dword, ptr, [rdx, +, 52]
shufps xmm3, xmm3, 0
mulps xmm3, xmm2
addps xmm3, xmm0
movss xmm0, dword, ptr, [rdx, +, 56]
shufps xmm0, xmm0, 0
mulps xmm0, xmm1
addps xmm0, xmm3
movss xmm1, dword, ptr, [rdx, +, 60]
shufps xmm1, xmm1, 0
mulps xmm1, xmm8
addps xmm1, xmm0
movups xmmword, ptr, [rdi], xmm4
movups xmmword, ptr, [rdi, +, 16], xmm5
movups xmmword, ptr, [rdi, +, 32], xmm6
movups xmmword, ptr, [rdi, +, 48], xmm1
ret
mathbench::cgmath_mat4_mul (src/lib.rs:201):
push rbx
sub rsp, 128
mov rbx, rdi
movups xmm0, xmmword, ptr, [rsi]
movups xmm1, xmmword, ptr, [rsi, +, 16]
movups xmm2, xmmword, ptr, [rsi, +, 32]
movups xmm3, xmmword, ptr, [rsi, +, 48]
movaps xmmword, ptr, [rsp, +, 48], xmm3
movaps xmmword, ptr, [rsp, +, 32], xmm2
movaps xmmword, ptr, [rsp, +, 16], xmm1
movaps xmmword, ptr, [rsp], xmm0
movups xmm0, xmmword, ptr, [rdx]
movups xmm1, xmmword, ptr, [rdx, +, 16]
movups xmm2, xmmword, ptr, [rdx, +, 32]
movups xmm3, xmmword, ptr, [rdx, +, 48]
movaps xmmword, ptr, [rsp, +, 112], xmm3
movaps xmmword, ptr, [rsp, +, 96], xmm2
movaps xmmword, ptr, [rsp, +, 80], xmm1
movaps xmmword, ptr, [rsp, +, 64], xmm0
mov rsi, rsp
lea rdx, [rsp, +, 64]
call <cgmath::matrix::Matrix4<S> as core::ops::arith::Mul>::mul
mov rax, rbx
add rsp, 128
pop rbx
ret
<cgmath::matrix::Matrix4<S> as core::ops::arith::Mul>::mul (/home/x/.cargo/registry/src/github.com-1ecc6299db9ec823/cgmath-0.17.0/src/macros.rs:69):
movups xmm4, xmmword, ptr, [rsi]
movups xmm3, xmmword, ptr, [rsi, +, 16]
movups xmm2, xmmword, ptr, [rsi, +, 32]
movups xmm8, xmmword, ptr, [rsi, +, 48]
movss xmm6, dword, ptr, [rdx]
movss xmm7, dword, ptr, [rdx, +, 4]
movss xmm5, dword, ptr, [rdx, +, 8]
movss xmm1, dword, ptr, [rdx, +, 12]
shufps xmm6, xmm6, 0
mulps xmm6, xmm4
shufps xmm7, xmm7, 0
mulps xmm7, xmm3
addps xmm7, xmm6
movss xmm6, dword, ptr, [rdx, +, 16]
shufps xmm5, xmm5, 0
mulps xmm5, xmm2
addps xmm5, xmm7
movss xmm7, dword, ptr, [rdx, +, 20]
shufps xmm1, xmm1, 0
mulps xmm1, xmm8
addps xmm1, xmm5
movss xmm0, dword, ptr, [rdx, +, 24]
shufps xmm6, xmm6, 0
mulps xmm6, xmm4
shufps xmm7, xmm7, 0
mulps xmm7, xmm3
addps xmm7, xmm6
movss xmm5, dword, ptr, [rdx, +, 28]
shufps xmm0, xmm0, 0
mulps xmm0, xmm2
addps xmm0, xmm7
movss xmm6, dword, ptr, [rdx, +, 32]
shufps xmm5, xmm5, 0
mulps xmm5, xmm8
addps xmm5, xmm0
movss xmm0, dword, ptr, [rdx, +, 36]
shufps xmm6, xmm6, 0
mulps xmm6, xmm4
shufps xmm0, xmm0, 0
mulps xmm0, xmm3
addps xmm0, xmm6
movss xmm6, dword, ptr, [rdx, +, 40]
shufps xmm6, xmm6, 0
mulps xmm6, xmm2
addps xmm6, xmm0
movss xmm0, dword, ptr, [rdx, +, 44]
shufps xmm0, xmm0, 0
mulps xmm0, xmm8
addps xmm0, xmm6
movss xmm6, dword, ptr, [rdx, +, 48]
shufps xmm6, xmm6, 0
mulps xmm6, xmm4
movss xmm4, dword, ptr, [rdx, +, 52]
shufps xmm4, xmm4, 0
mulps xmm4, xmm3
addps xmm4, xmm6
movss xmm3, dword, ptr, [rdx, +, 56]
shufps xmm3, xmm3, 0
mulps xmm3, xmm2
addps xmm3, xmm4
movss xmm2, dword, ptr, [rdx, +, 60]
shufps xmm2, xmm2, 0
mulps xmm2, xmm8
addps xmm2, xmm3
movups xmmword, ptr, [rdi], xmm1
movups xmmword, ptr, [rdi, +, 16], xmm5
movups xmmword, ptr, [rdi, +, 32], xmm0
movups xmmword, ptr, [rdi, +, 48], xmm2
ret