Lines Copies Function name ----- ------ ------------- 30107 (100%) 477 (100%) (TOTAL) 4970 (16.5%) 2 (0.4%) matrixmultiply::dgemm_kernel::kernel_x86_avx 3770 (12.5%) 2 (0.4%) matrixmultiply::sgemm_kernel::kernel_x86_avx 3088 (10.3%) 8 (1.7%) matrixmultiply::gemm::gemm_loop 2184 (7.3%) 16 (3.4%) matrixmultiply::gemm::gemm_packed::{{closure}} 1628 (5.4%) 4 (0.8%) matrixmultiply::gemm::pack 1624 (5.4%) 16 (3.4%) matrixmultiply::threading::RangeChunkParallel::for_each 1312 (4.4%) 16 (3.4%) matrixmultiply::gemm::gemm_loop::{{closure}} 1128 (3.7%) 8 (1.7%) matrixmultiply::gemm::c_to_masked_ab_beta_c 960 (3.2%) 8 (1.7%) matrixmultiply::gemm::make_packing_buffer 904 (3.0%) 8 (1.7%) matrixmultiply::gemm::gemm_packed 896 (3.0%) 8 (1.7%) matrixmultiply::gemm::ensure_kernel_params 744 (2.5%) 8 (1.7%) as matrixmultiply::kernel::GemmSelect>::select 608 (2.0%) 8 (1.7%) std::thread::local::LocalKey::try_with 416 (1.4%) 16 (3.4%) matrixmultiply::threading::RangeChunkParallel::thread_local 360 (1.2%) 8 (1.7%) matrixmultiply::gemm::masked_kernel 332 (1.1%) 1 (0.2%) matrixmultiply::dgemm_kernel::kernel_fallback_impl 332 (1.1%) 1 (0.2%) matrixmultiply::sgemm_kernel::kernel_fallback_impl 212 (0.7%) 2 (0.4%) matrixmultiply::gemm::c_to_beta_c 200 (0.7%) 8 (1.7%) matrixmultiply::threading::LoopThreadConfig::new 152 (0.5%) 8 (1.7%) rawpointer::PointerExt::stride_offset 135 (0.4%) 3 (0.6%) core::mem::replace 108 (0.4%) 2 (0.4%) matrixmultiply::aligned_alloc::Alloc::new 96 (0.3%) 8 (1.7%) matrixmultiply::gemm::gemm_packed::{{closure}}::{{closure}} 96 (0.3%) 8 (1.7%) std::thread::local::LocalKey::with 84 (0.3%) 1 (0.2%) matrixmultiply::dgemm_kernel::detect 84 (0.3%) 1 (0.2%) matrixmultiply::sgemm_kernel::detect 74 (0.2%) 1 (0.2%) matrixmultiply::gemm::dgemm 74 (0.2%) 1 (0.2%) matrixmultiply::gemm::sgemm 72 (0.2%) 4 (0.8%) core::panicking::assert_failed 70 (0.2%) 1 (0.2%) core::cmp::max_by 70 (0.2%) 1 (0.2%) core::cmp::min_by 69 (0.2%) 1 (0.2%) std::thread::local::lazy::LazyKeyInner::initialize 65 (0.2%) 1 (0.2%) ::next 64 (0.2%) 2 (0.4%) as core::ops::drop::Drop>::drop 59 (0.2%) 1 (0.2%) core::fmt::Arguments::new_v1 58 (0.2%) 2 (0.4%) core::ptr::metadata::from_raw_parts_mut 58 (0.2%) 1 (0.2%) std::thread::local::fast::Key::get 57 (0.2%) 1 (0.2%) std::thread::local::fast::Key::try_initialize 54 (0.2%) 1 (0.2%) matrixmultiply::util::round_up_to 51 (0.2%) 3 (0.6%) core::ptr::read 50 (0.2%) 1 (0.2%) std_detect::detect::cache::test 49 (0.2%) 1 (0.2%) core::alloc::layout::Layout::from_size_align 48 (0.2%) 1 (0.2%) core::result::Result::unwrap 46 (0.2%) 1 (0.2%) core::result::Result::expect 43 (0.1%) 1 (0.2%) matrixmultiply::gemm::align_ptr 41 (0.1%) 1 (0.2%) core::core_arch::x86::avx::_mm256_setr_ps 40 (0.1%) 5 (1.0%) core::core_arch::x86::avx::_mm256_permute2f128_pd 40 (0.1%) 4 (0.8%) <&T as core::fmt::Debug>::fmt 40 (0.1%) 3 (0.6%) core::intrinsics::copy_nonoverlapping 39 (0.1%) 1 (0.2%) core::option::Option::unwrap_or_else 36 (0.1%) 3 (0.6%) core::ptr::mut_ptr::::offset 36 (0.1%) 1 (0.2%) as core::iter::range::RangeIteratorImpl>::spec_next 36 (0.1%) 1 (0.2%) core::option::Option::ok_or 36 (0.1%) 1 (0.2%) core::sync::atomic::atomic_load 33 (0.1%) 1 (0.2%) core::cell::Cell::replace 33 (0.1%) 1 (0.2%) std::thread::local::fast::destroy_value 32 (0.1%) 4 (0.8%) matrixmultiply::ptr::Ptr 32 (0.1%) 1 (0.2%) core::fmt::num::::fmt 32 (0.1%) 1 (0.2%) core::fmt::num::::fmt 32 (0.1%) 1 (0.2%) std::thread::local::fast::Key::try_register_dtor 30 (0.1%) 2 (0.4%) core::ptr::slice_from_raw_parts_mut 30 (0.1%) 2 (0.4%) core::slice::raw::from_raw_parts_mut 30 (0.1%) 1 (0.2%) as core::ops::try_trait::Try>::branch 27 (0.1%) 1 (0.2%) ::kernel 27 (0.1%) 1 (0.2%) ::kernel 27 (0.1%) 1 (0.2%) ::kernel 27 (0.1%) 1 (0.2%) ::kernel 27 (0.1%) 1 (0.2%) ::kernel 27 (0.1%) 1 (0.2%) ::kernel 27 (0.1%) 1 (0.2%) ::kernel 27 (0.1%) 1 (0.2%) ::kernel 27 (0.1%) 1 (0.2%) matrixmultiply::dgemm_kernel::kernel_target_avx 27 (0.1%) 1 (0.2%) matrixmultiply::dgemm_kernel::kernel_target_fma 27 (0.1%) 1 (0.2%) matrixmultiply::dgemm_kernel::kernel_target_sse2 27 (0.1%) 1 (0.2%) matrixmultiply::sgemm_kernel::kernel_target_avx 27 (0.1%) 1 (0.2%) matrixmultiply::sgemm_kernel::kernel_target_fma 27 (0.1%) 1 (0.2%) matrixmultiply::sgemm_kernel::kernel_target_sse2 27 (0.1%) 1 (0.2%) std_detect::detect::cache::Cache::test 26 (0.1%) 3 (0.6%) core::ptr::write 24 (0.1%) 3 (0.6%) core::core_arch::x86::avx::_mm256_permute2f128_ps 24 (0.1%) 3 (0.6%) core::core_arch::x86::avx::_mm256_shuffle_pd 24 (0.1%) 3 (0.6%) core::mem::maybe_uninit::MaybeUninit::assume_init 24 (0.1%) 3 (0.6%) core::slice::::as_mut_ptr 24 (0.1%) 2 (0.4%) as rawpointer::PointerExt>::offset 24 (0.1%) 2 (0.4%) as rawpointer::PointerExt>::offset 24 (0.1%) 2 (0.4%) core::ptr::const_ptr::::offset 24 (0.1%) 1 (0.2%) core::cmp::impls::::cmp 24 (0.1%) 1 (0.2%) core::option::Option::as_ref 22 (0.1%) 2 (0.4%) core::ops::function::FnOnce::call_once 22 (0.1%) 1 (0.2%) matrixmultiply::threading::::parallel 21 (0.1%) 1 (0.2%) ::multiply_add 21 (0.1%) 1 (0.2%) ::multiply_add 21 (0.1%) 1 (0.2%) core::core_arch::x86::avx::_mm256_setr_pd 20 (0.1%) 4 (0.8%) matrixmultiply::ptr::Ptr::ptr 20 (0.1%) 2 (0.4%) core::core_arch::x86::avx::_mm256_extractf128_pd 20 (0.1%) 2 (0.4%) core::core_arch::x86::avx::_mm256_extractf128_ps 20 (0.1%) 1 (0.2%) core::alloc::layout::Layout::from_size_align_unchecked 20 (0.1%) 1 (0.2%) matrixmultiply::threading::get_thread_pool 18 (0.1%) 4 (0.8%) core::cell::UnsafeCell::get 18 (0.1%) 2 (0.4%) <*const T as rawpointer::PointerExt>::offset 18 (0.1%) 2 (0.4%) <*mut T as rawpointer::PointerExt>::offset 18 (0.1%) 2 (0.4%) core::ptr::const_ptr::::add 18 (0.1%) 2 (0.4%) core::ptr::mut_ptr::::add 18 (0.1%) 2 (0.4%) matrixmultiply::ptr::Ptr<*mut T>::to_const 18 (0.1%) 2 (0.4%) rawpointer::PointerExt::add 18 (0.1%) 1 (0.2%) as core::ops::try_trait::FromResidual>>::from_residual 18 (0.1%) 1 (0.2%) std_detect::detect::cache::test_bit 16 (0.1%) 2 (0.4%) core::slice::::as_ptr 16 (0.1%) 2 (0.4%) rawpointer::PointerExt::inc 16 (0.1%) 1 (0.2%) alloc::alloc::dealloc 15 (0.0%) 1 (0.2%) ::multiply_add 15 (0.0%) 1 (0.2%) ::multiply_add 14 (0.0%) 3 (0.6%) core::mem::manually_drop::ManuallyDrop::into_inner 14 (0.0%) 2 (0.4%) ::into_iter 14 (0.0%) 2 (0.4%) core::slice::raw::debug_check_data_len 14 (0.0%) 1 (0.2%) core::cell::Cell::set 14 (0.0%) 1 (0.2%) core::ptr::mut_ptr::::guaranteed_eq 13 (0.0%) 3 (0.6%) core::mem::maybe_uninit::MaybeUninit::as_mut_ptr 13 (0.0%) 1 (0.2%) alloc::alloc::alloc 13 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_loadu_pd 13 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_loadu_ps 13 (0.0%) 1 (0.2%) matrixmultiply::util::range_chunk 12 (0.0%) 2 (0.4%) matrixmultiply::aligned_alloc::Alloc::ptr_mut 12 (0.0%) 1 (0.2%) core::core_arch::x86::sse2::_mm_storeh_pd 12 (0.0%) 1 (0.2%) core::core_arch::x86::sse2::_mm_storel_pd 12 (0.0%) 1 (0.2%) core::core_arch::x86::sse::_mm_set1_ps 12 (0.0%) 1 (0.2%) core::core_arch::x86::sse::_mm_store_ss 12 (0.0%) 1 (0.2%) core::num::::unchecked_add 12 (0.0%) 1 (0.2%) core::sync::atomic::AtomicUsize::load 11 (0.0%) 1 (0.2%) matrixmultiply::gemm::MASK_BUF::__init 10 (0.0%) 3 (0.6%) core::mem::maybe_uninit::MaybeUninit::uninit 10 (0.0%) 2 (0.4%) core::ptr::mut_ptr::::cast 10 (0.0%) 1 (0.2%) ::add_assign 10 (0.0%) 1 (0.2%) ::mul_assign 10 (0.0%) 1 (0.2%) ::add_assign 10 (0.0%) 1 (0.2%) ::mul_assign 10 (0.0%) 1 (0.2%) core::cell::UnsafeCell::new 10 (0.0%) 1 (0.2%) core::cmp::impls::::lt 10 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_permute_ps 10 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm_permute_ps 10 (0.0%) 1 (0.2%) core::core_arch::x86::fma::_mm256_fmadd_pd 10 (0.0%) 1 (0.2%) core::core_arch::x86::fma::_mm256_fmadd_ps 10 (0.0%) 1 (0.2%) core::iter::range::>::next 10 (0.0%) 1 (0.2%) core::num::::count_ones 10 (0.0%) 1 (0.2%) matrixmultiply::dgemm_kernel::at 10 (0.0%) 1 (0.2%) matrixmultiply::sgemm_kernel::at 10 (0.0%) 1 (0.2%) std_detect::detect::cache::test::{{closure}} 9 (0.0%) 1 (0.2%) ::forward_unchecked 9 (0.0%) 1 (0.2%) core::cmp::Ord::max 9 (0.0%) 1 (0.2%) core::cmp::Ord::min 9 (0.0%) 1 (0.2%) core::cmp::max 9 (0.0%) 1 (0.2%) core::cmp::min 9 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_storeu_pd 9 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_storeu_ps 9 (0.0%) 1 (0.2%) core::option::Option::take 9 (0.0%) 1 (0.2%) std::thread::local::lazy::LazyKeyInner::get 9 (0.0%) 1 (0.2%) std::thread::local::lazy::LazyKeyInner::take 9 (0.0%) 1 (0.2%) std_detect::detect::cache::Initializer::test 9 (0.0%) 1 (0.2%) std_detect::detect::check_for 8 (0.0%) 2 (0.4%) core::mem::drop 8 (0.0%) 1 (0.2%) core::alloc::layout::Layout::align 8 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_add_pd 8 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_add_ps 8 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_blend_pd 8 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_mul_pd 8 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_mul_ps 8 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_shuffle_ps 8 (0.0%) 1 (0.2%) core::ptr::mut_ptr::::is_null 7 (0.0%) 1 (0.2%) core::cell::Cell::get 7 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_broadcast_sd 7 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_load_pd 7 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_load_ps 7 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_movehdup_ps 7 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_moveldup_ps 7 (0.0%) 1 (0.2%) core::num::::is_power_of_two 7 (0.0%) 1 (0.2%) core::num::nonzero::NonZeroUsize::new_unchecked 7 (0.0%) 1 (0.2%) matrixmultiply::threading::LoopThreadConfig::num_pack_a 6 (0.0%) 1 (0.2%) ::is_zero 6 (0.0%) 1 (0.2%) ::is_zero 6 (0.0%) 1 (0.2%) core::alloc::layout::Layout::size 6 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_set1_pd 6 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_set1_ps 6 (0.0%) 1 (0.2%) core::mem::needs_drop 6 (0.0%) 1 (0.2%) core::ptr::drop_in_place> 6 (0.0%) 1 (0.2%) core::ptr::drop_in_place> 6 (0.0%) 1 (0.2%) std_detect::detect::arch::__is_feature_detected::avx 6 (0.0%) 1 (0.2%) std_detect::detect::arch::__is_feature_detected::fma 5 (0.0%) 5 (1.0%) core::mem::size_of 5 (0.0%) 1 (0.2%) core::clone::impls::::clone 4 (0.0%) 1 (0.2%) core::num::nonzero::NonZeroUsize::get 4 (0.0%) 1 (0.2%) core::ptr::drop_in_place<&f32> 4 (0.0%) 1 (0.2%) core::ptr::drop_in_place<&f64> 4 (0.0%) 1 (0.2%) core::ptr::drop_in_place<&i32> 4 (0.0%) 1 (0.2%) core::ptr::drop_in_place<&usize> 4 (0.0%) 1 (0.2%) core::ptr::drop_in_place 4 (0.0%) 1 (0.2%) core::ptr::drop_in_place 4 (0.0%) 1 (0.2%) matrixmultiply::threading::Registry::thread_pool 4 (0.0%) 1 (0.2%) matrixmultiply::threading::ThreadPool::top 3 (0.0%) 1 (0.2%) >::from 3 (0.0%) 1 (0.2%) ::align_to 3 (0.0%) 1 (0.2%) ::always_masked 3 (0.0%) 1 (0.2%) ::align_to 3 (0.0%) 1 (0.2%) ::always_masked 3 (0.0%) 1 (0.2%) ::always_masked 3 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_setzero_pd 3 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_setzero_ps 3 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_undefined_pd 3 (0.0%) 1 (0.2%) core::core_arch::x86::avx::_mm256_undefined_ps 3 (0.0%) 1 (0.2%) core::core_arch::x86::sse::_mm_undefined_ps 3 (0.0%) 1 (0.2%) matrixmultiply::gemm::MASK_BUF::__getit 2 (0.0%) 2 (0.4%) core::mem::align_of 1 (0.0%) 1 (0.2%) ::one 1 (0.0%) 1 (0.2%) ::test_value 1 (0.0%) 1 (0.2%) ::zero 1 (0.0%) 1 (0.2%) ::one 1 (0.0%) 1 (0.2%) ::test_value 1 (0.0%) 1 (0.2%) ::zero 1 (0.0%) 1 (0.2%) ::align_to 1 (0.0%) 1 (0.2%) ::always_masked 1 (0.0%) 1 (0.2%) ::kc 1 (0.0%) 1 (0.2%) ::mc 1 (0.0%) 1 (0.2%) ::nc 1 (0.0%) 1 (0.2%) ::align_to 1 (0.0%) 1 (0.2%) ::always_masked 1 (0.0%) 1 (0.2%) ::kc 1 (0.0%) 1 (0.2%) ::mc 1 (0.0%) 1 (0.2%) ::nc 1 (0.0%) 1 (0.2%) ::kc 1 (0.0%) 1 (0.2%) ::mc 1 (0.0%) 1 (0.2%) ::nc 1 (0.0%) 1 (0.2%) ::align_to 1 (0.0%) 1 (0.2%) ::always_masked 1 (0.0%) 1 (0.2%) ::kc 1 (0.0%) 1 (0.2%) ::mc 1 (0.0%) 1 (0.2%) ::nc 1 (0.0%) 1 (0.2%) ::align_to 1 (0.0%) 1 (0.2%) ::always_masked 1 (0.0%) 1 (0.2%) ::kc 1 (0.0%) 1 (0.2%) ::mc 1 (0.0%) 1 (0.2%) ::nc 1 (0.0%) 1 (0.2%) ::align_to 1 (0.0%) 1 (0.2%) ::always_masked 1 (0.0%) 1 (0.2%) ::kc 1 (0.0%) 1 (0.2%) ::mc 1 (0.0%) 1 (0.2%) ::nc 1 (0.0%) 1 (0.2%) ::kc 1 (0.0%) 1 (0.2%) ::mc 1 (0.0%) 1 (0.2%) ::nc 1 (0.0%) 1 (0.2%) ::align_to 1 (0.0%) 1 (0.2%) ::kc 1 (0.0%) 1 (0.2%) ::mc 1 (0.0%) 1 (0.2%) ::nc 1 (0.0%) 1 (0.2%) core::hint::unreachable_unchecked 1 (0.0%) 1 (0.2%) core::ptr::null_mut 1 (0.0%) 1 (0.2%) matrixmultiply::threading::::parallel::nop