Lines Copies Function name ----- ------ ------------- 46689 (100%) 458 (100%) (TOTAL) 10626 (22.8%) 2 (0.4%) matrixmultiply::sgemm_kernel::kernel_x86_avx 8840 (18.9%) 2 (0.4%) matrixmultiply::dgemm_kernel::kernel_x86_avx 3824 (8.2%) 1 (0.2%) matrixmultiply::sgemm_kernel::kernel_fallback_impl 3152 (6.8%) 8 (1.7%) matrixmultiply::gemm::gemm_loop 2232 (4.8%) 16 (3.5%) matrixmultiply::threading::RangeChunkParallel::for_each 2144 (4.6%) 16 (3.5%) matrixmultiply::gemm::gemm_packed::{{closure}} 2020 (4.3%) 1 (0.2%) matrixmultiply::dgemm_kernel::kernel_fallback_impl 1876 (4.0%) 4 (0.9%) matrixmultiply::gemm::pack 1432 (3.1%) 8 (1.7%) matrixmultiply::gemm::c_to_masked_ab_beta_c 1112 (2.4%) 16 (3.5%) matrixmultiply::gemm::gemm_loop::{{closure}} 944 (2.0%) 8 (1.7%) matrixmultiply::gemm::gemm_packed 776 (1.7%) 8 (1.7%) as matrixmultiply::kernel::GemmSelect>::select 776 (1.7%) 8 (1.7%) matrixmultiply::gemm::ensure_kernel_params 720 (1.5%) 8 (1.7%) std::thread::local::LocalKey::try_with 528 (1.1%) 8 (1.7%) matrixmultiply::gemm::make_packing_buffer 512 (1.1%) 16 (3.5%) matrixmultiply::threading::RangeChunkParallel::thread_local 360 (0.8%) 8 (1.7%) matrixmultiply::gemm::masked_kernel 274 (0.6%) 2 (0.4%) matrixmultiply::gemm::c_to_beta_c 200 (0.4%) 8 (1.7%) matrixmultiply::threading::LoopThreadConfig::new 157 (0.3%) 3 (0.7%) core::mem::replace 115 (0.2%) 1 (0.2%) matrixmultiply::dgemm_kernel::detect 115 (0.2%) 1 (0.2%) matrixmultiply::sgemm_kernel::detect 104 (0.2%) 8 (1.7%) rawpointer::PointerExt::stride_offset 96 (0.2%) 8 (1.7%) matrixmultiply::gemm::gemm_packed::{{closure}}::{{closure}} 96 (0.2%) 8 (1.7%) std::thread::local::LocalKey::with 89 (0.2%) 1 (0.2%) std::thread::local::lazy::LazyKeyInner::initialize 88 (0.2%) 2 (0.4%) matrixmultiply::aligned_alloc::Alloc::new 80 (0.2%) 1 (0.2%) core::cmp::max_by 80 (0.2%) 1 (0.2%) core::cmp::min_by 78 (0.2%) 1 (0.2%) matrixmultiply::gemm::dgemm 78 (0.2%) 1 (0.2%) matrixmultiply::gemm::sgemm 74 (0.2%) 2 (0.4%) core::ptr::metadata::from_raw_parts_mut 66 (0.1%) 1 (0.2%) std::thread::local::fast::Key::get 65 (0.1%) 3 (0.7%) core::ptr::read 63 (0.1%) 1 (0.2%) std::thread::local::fast::Key::try_initialize 62 (0.1%) 1 (0.2%) core::fmt::Arguments::new_v1 57 (0.1%) 1 (0.2%) ::next 53 (0.1%) 1 (0.2%) std_detect::detect::cache::test 52 (0.1%) 2 (0.4%) as core::ops::drop::Drop>::drop 50 (0.1%) 1 (0.2%) core::result::Result::expect 50 (0.1%) 1 (0.2%) matrixmultiply::util::round_up_to 48 (0.1%) 3 (0.7%) core::ptr::mut_ptr::::offset 45 (0.1%) 1 (0.2%) std::thread::local::fast::destroy_value 41 (0.1%) 1 (0.2%) core::core_arch::x86::avx::_mm256_setr_ps 40 (0.1%) 5 (1.1%) core::core_arch::x86::avx::_mm256_permute2f128_pd 40 (0.1%) 3 (0.7%) core::intrinsics::copy_nonoverlapping 40 (0.1%) 1 (0.2%) core::sync::atomic::atomic_load 40 (0.1%) 1 (0.2%) matrixmultiply::gemm::align_ptr 39 (0.1%) 1 (0.2%) core::option::Option::unwrap_or_else 37 (0.1%) 1 (0.2%) core::cell::Cell::replace 36 (0.1%) 1 (0.2%) as core::iter::range::RangeIteratorImpl>::spec_next 36 (0.1%) 1 (0.2%) core::option::Option::ok_or 36 (0.1%) 1 (0.2%) std::thread::local::fast::Key::try_register_dtor 32 (0.1%) 4 (0.9%) matrixmultiply::ptr::Ptr 32 (0.1%) 2 (0.4%) core::ptr::const_ptr::::offset 30 (0.1%) 2 (0.4%) core::ptr::slice_from_raw_parts_mut 30 (0.1%) 2 (0.4%) core::slice::raw::from_raw_parts_mut 30 (0.1%) 1 (0.2%) as core::ops::try_trait::Try>::branch 29 (0.1%) 1 (0.2%) std_detect::detect::cache::Cache::test 28 (0.1%) 3 (0.7%) core::mem::maybe_uninit::MaybeUninit::assume_init 28 (0.1%) 2 (0.4%) core::core_arch::x86::avx::_mm256_extractf128_pd 28 (0.1%) 2 (0.4%) core::core_arch::x86::avx::_mm256_extractf128_ps 27 (0.1%) 1 (0.2%) ::kernel 27 (0.1%) 1 (0.2%) ::kernel 27 (0.1%) 1 (0.2%) ::kernel 27 (0.1%) 1 (0.2%) ::kernel 27 (0.1%) 1 (0.2%) ::kernel 27 (0.1%) 1 (0.2%) ::kernel 27 (0.1%) 1 (0.2%) ::kernel 27 (0.1%) 1 (0.2%) ::kernel 27 (0.1%) 1 (0.2%) matrixmultiply::dgemm_kernel::kernel_target_avx 27 (0.1%) 1 (0.2%) matrixmultiply::dgemm_kernel::kernel_target_fma 27 (0.1%) 1 (0.2%) matrixmultiply::dgemm_kernel::kernel_target_sse2 27 (0.1%) 1 (0.2%) matrixmultiply::sgemm_kernel::kernel_target_avx 27 (0.1%) 1 (0.2%) matrixmultiply::sgemm_kernel::kernel_target_fma 27 (0.1%) 1 (0.2%) matrixmultiply::sgemm_kernel::kernel_target_sse2 26 (0.1%) 3 (0.7%) core::ptr::write 26 (0.1%) 1 (0.2%) matrixmultiply::threading::::parallel 25 (0.1%) 1 (0.2%) ::multiply_add 25 (0.1%) 1 (0.2%) ::multiply_add 24 (0.1%) 3 (0.7%) core::core_arch::x86::avx::_mm256_permute2f128_ps 24 (0.1%) 3 (0.7%) core::core_arch::x86::avx::_mm256_shuffle_pd 24 (0.1%) 3 (0.7%) core::slice::::as_mut_ptr 24 (0.1%) 2 (0.4%) as rawpointer::PointerExt>::offset 24 (0.1%) 2 (0.4%) as rawpointer::PointerExt>::offset 24 (0.1%) 1 (0.2%) core::cmp::impls::::cmp 24 (0.1%) 1 (0.2%) core::option::Option::as_ref 22 (0.0%) 2 (0.4%) core::ops::function::FnOnce::call_once 21 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_setr_pd 20 (0.0%) 4 (0.9%) matrixmultiply::ptr::Ptr::ptr 20 (0.0%) 1 (0.2%) core::alloc::layout::Layout::from_size_align_unchecked 20 (0.0%) 1 (0.2%) matrixmultiply::threading::get_thread_pool 19 (0.0%) 1 (0.2%) matrixmultiply::gemm::MASK_BUF::__init 18 (0.0%) 4 (0.9%) core::cell::UnsafeCell::get 18 (0.0%) 2 (0.4%) <*const T as rawpointer::PointerExt>::offset 18 (0.0%) 2 (0.4%) <*mut T as rawpointer::PointerExt>::offset 18 (0.0%) 2 (0.4%) core::ptr::const_ptr::::add 18 (0.0%) 2 (0.4%) core::ptr::mut_ptr::::add 18 (0.0%) 2 (0.4%) matrixmultiply::ptr::Ptr<*mut T>::to_const 18 (0.0%) 2 (0.4%) rawpointer::PointerExt::add 18 (0.0%) 1 (0.2%) as core::ops::try_trait::FromResidual>>::from_residual 16 (0.0%) 2 (0.4%) core::slice::::as_ptr 16 (0.0%) 2 (0.4%) rawpointer::PointerExt::inc 16 (0.0%) 1 (0.2%) alloc::alloc::dealloc 16 (0.0%) 1 (0.2%) core::core_arch::x86::sse2::_mm_storeh_pd 16 (0.0%) 1 (0.2%) core::core_arch::x86::sse2::_mm_storel_pd 16 (0.0%) 1 (0.2%) core::core_arch::x86::sse::_mm_store_ss 16 (0.0%) 1 (0.2%) core::num::::unchecked_add 16 (0.0%) 1 (0.2%) core::ptr::mut_ptr::::guaranteed_eq 15 (0.0%) 1 (0.2%) ::multiply_add 15 (0.0%) 1 (0.2%) ::multiply_add 14 (0.0%) 3 (0.7%) core::mem::manually_drop::ManuallyDrop::into_inner 14 (0.0%) 2 (0.4%) ::into_iter 14 (0.0%) 2 (0.4%) core::slice::raw::debug_check_data_len 14 (0.0%) 1 (0.2%) core::cell::Cell::set 14 (0.0%) 1 (0.2%) core::cell::UnsafeCell::new 14 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_permute_ps 14 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm_permute_ps 13 (0.0%) 3 (0.7%) core::mem::maybe_uninit::MaybeUninit::as_mut_ptr 13 (0.0%) 1 (0.2%) alloc::alloc::alloc 13 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_loadu_pd 13 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_loadu_ps 13 (0.0%) 1 (0.2%) core::option::Option::take 13 (0.0%) 1 (0.2%) matrixmultiply::util::range_chunk 13 (0.0%) 1 (0.2%) std_detect::detect::cache::test_bit 12 (0.0%) 2 (0.4%) matrixmultiply::aligned_alloc::Alloc::ptr_mut 12 (0.0%) 1 (0.2%) core::core_arch::x86::sse::_mm_set1_ps 12 (0.0%) 1 (0.2%) core::sync::atomic::AtomicUsize::load 10 (0.0%) 3 (0.7%) core::mem::maybe_uninit::MaybeUninit::uninit 10 (0.0%) 2 (0.4%) core::ptr::mut_ptr::::cast 10 (0.0%) 1 (0.2%) ::add_assign 10 (0.0%) 1 (0.2%) ::mul_assign 10 (0.0%) 1 (0.2%) ::add_assign 10 (0.0%) 1 (0.2%) ::mul_assign 10 (0.0%) 1 (0.2%) core::cmp::impls::::lt 10 (0.0%) 1 (0.2%) core::core_arch::x86::fma::_mm256_fmadd_pd 10 (0.0%) 1 (0.2%) core::core_arch::x86::fma::_mm256_fmadd_ps 10 (0.0%) 1 (0.2%) core::iter::range::>::next 10 (0.0%) 1 (0.2%) matrixmultiply::dgemm_kernel::at 10 (0.0%) 1 (0.2%) matrixmultiply::sgemm_kernel::at 10 (0.0%) 1 (0.2%) std_detect::detect::cache::test::{{closure}} 9 (0.0%) 1 (0.2%) ::forward_unchecked 9 (0.0%) 1 (0.2%) core::cmp::Ord::max 9 (0.0%) 1 (0.2%) core::cmp::Ord::min 9 (0.0%) 1 (0.2%) core::cmp::max 9 (0.0%) 1 (0.2%) core::cmp::min 9 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_storeu_pd 9 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_storeu_ps 9 (0.0%) 1 (0.2%) std::thread::local::lazy::LazyKeyInner::get 9 (0.0%) 1 (0.2%) std::thread::local::lazy::LazyKeyInner::take 9 (0.0%) 1 (0.2%) std_detect::detect::cache::Initializer::test 9 (0.0%) 1 (0.2%) std_detect::detect::check_for 8 (0.0%) 2 (0.4%) core::mem::drop 8 (0.0%) 1 (0.2%) core::alloc::layout::Layout::align 8 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_add_pd 8 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_add_ps 8 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_blend_pd 8 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_mul_pd 8 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_mul_ps 8 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_shuffle_ps 8 (0.0%) 1 (0.2%) core::mem::needs_drop 8 (0.0%) 1 (0.2%) core::ptr::mut_ptr::::is_null 8 (0.0%) 1 (0.2%) std_detect::detect::arch::__is_feature_detected::avx 8 (0.0%) 1 (0.2%) std_detect::detect::arch::__is_feature_detected::fma 7 (0.0%) 1 (0.2%) core::cell::Cell::get 7 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_broadcast_sd 7 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_load_pd 7 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_load_ps 7 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_movehdup_ps 7 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_moveldup_ps 7 (0.0%) 1 (0.2%) core::num::nonzero::NonZeroUsize::new_unchecked 7 (0.0%) 1 (0.2%) matrixmultiply::threading::LoopThreadConfig::num_pack_a 6 (0.0%) 1 (0.2%) ::is_zero 6 (0.0%) 1 (0.2%) ::is_zero 6 (0.0%) 1 (0.2%) core::alloc::layout::Layout::size 6 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_set1_pd 6 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_set1_ps 6 (0.0%) 1 (0.2%) core::ptr::drop_in_place> 6 (0.0%) 1 (0.2%) core::ptr::drop_in_place> 5 (0.0%) 5 (1.1%) core::mem::size_of 5 (0.0%) 1 (0.2%) core::clone::impls::::clone 4 (0.0%) 1 (0.2%) core::num::nonzero::NonZeroUsize::get 4 (0.0%) 1 (0.2%) core::ptr::drop_in_place 4 (0.0%) 1 (0.2%) matrixmultiply::threading::Registry::thread_pool 4 (0.0%) 1 (0.2%) matrixmultiply::threading::ThreadPool::top 3 (0.0%) 1 (0.2%) >::from 3 (0.0%) 1 (0.2%) ::align_to 3 (0.0%) 1 (0.2%) ::always_masked 3 (0.0%) 1 (0.2%) ::align_to 3 (0.0%) 1 (0.2%) ::always_masked 3 (0.0%) 1 (0.2%) ::always_masked 3 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_setzero_pd 3 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_setzero_ps 3 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_undefined_pd 3 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_undefined_ps 3 (0.0%) 1 (0.2%) core::core_arch::x86::sse::_mm_undefined_ps 3 (0.0%) 1 (0.2%) matrixmultiply::gemm::MASK_BUF::__getit 2 (0.0%) 2 (0.4%) core::mem::align_of 1 (0.0%) 1 (0.2%) ::one 1 (0.0%) 1 (0.2%) ::test_value 1 (0.0%) 1 (0.2%) ::zero 1 (0.0%) 1 (0.2%) ::one 1 (0.0%) 1 (0.2%) ::test_value 1 (0.0%) 1 (0.2%) ::zero 1 (0.0%) 1 (0.2%) ::align_to 1 (0.0%) 1 (0.2%) ::always_masked 1 (0.0%) 1 (0.2%) ::kc 1 (0.0%) 1 (0.2%) ::mc 1 (0.0%) 1 (0.2%) ::nc 1 (0.0%) 1 (0.2%) ::align_to 1 (0.0%) 1 (0.2%) ::always_masked 1 (0.0%) 1 (0.2%) ::kc 1 (0.0%) 1 (0.2%) ::mc 1 (0.0%) 1 (0.2%) ::nc 1 (0.0%) 1 (0.2%) ::kc 1 (0.0%) 1 (0.2%) ::mc 1 (0.0%) 1 (0.2%) ::nc 1 (0.0%) 1 (0.2%) ::align_to 1 (0.0%) 1 (0.2%) ::always_masked 1 (0.0%) 1 (0.2%) ::kc 1 (0.0%) 1 (0.2%) ::mc 1 (0.0%) 1 (0.2%) ::nc 1 (0.0%) 1 (0.2%) ::align_to 1 (0.0%) 1 (0.2%) ::always_masked 1 (0.0%) 1 (0.2%) ::kc 1 (0.0%) 1 (0.2%) ::mc 1 (0.0%) 1 (0.2%) ::nc 1 (0.0%) 1 (0.2%) ::align_to 1 (0.0%) 1 (0.2%) ::always_masked 1 (0.0%) 1 (0.2%) ::kc 1 (0.0%) 1 (0.2%) ::mc 1 (0.0%) 1 (0.2%) ::nc 1 (0.0%) 1 (0.2%) ::kc 1 (0.0%) 1 (0.2%) ::mc 1 (0.0%) 1 (0.2%) ::nc 1 (0.0%) 1 (0.2%) ::align_to 1 (0.0%) 1 (0.2%) ::kc 1 (0.0%) 1 (0.2%) ::mc 1 (0.0%) 1 (0.2%) ::nc 1 (0.0%) 1 (0.2%) core::hint::unreachable_unchecked 1 (0.0%) 1 (0.2%) core::ptr::null_mut 1 (0.0%) 1 (0.2%) matrixmultiply::threading::::parallel::nop