Here is another possibility. I find that octave_value_list often accounts
for ~1% of the time in a particular leaf function. If I check the annotated
code, I see that the atomic locking instructions take a very long time.
--- Start Code Annotation ---
octave_value_list::octave_value_list
/home/rik/wip/Projects_Mine/octave-dev/libgui/.libs/liboctgui.so.5.0.0
Samples│ _ZN17octave_value_listC2Ev():
│ OCTINTERP_API
│ octave_value_list
│ {
│ public:
│
│ octave_value_list (void)
│ push %rbp
11 │ mov %rsp,%rbp
7 │ push %r12
1 │ push %rbx
│ _ZN17octave_value_listC1Ev():
1 │ add $0x10,%rax
│ _ZN17octave_value_listC2Ev():
15 │ mov %rdi,%rbx
│ _ZN17octave_value_listC1Ev():
6 │ mov %rax,(%rdi)
│
│ public:
│
│ static octave_idx_type dim_max (void);
│
│ explicit dim_vector (void) : rep (nil_rep ())
1 │ → callq dim_vector::nil_rep()@plt
3 │ mov %rax,0x8(%rbx)
│ { OCTAVE_ATOMIC_INCREMENT (&(count ())); }
327 │ lock addq $0x1,-0x10(%rax)
│ : dimensions (), rep (nil_rep ()), slice_data (rep->data),
1 │ → callq Array<octave_value
│ slice_len (rep->len)
10 │ mov (%rax),%rdx
7 │ mov %rax,0x10(%rbx)
│ mov %rdx,0x18(%rbx)
│ mov 0x8(%rax),%rdx
14 │ mov %rdx,0x20(%rbx)
│ return OCTAVE_ATOMIC_INCREMENT (&m_count);
│ }
│
│ count_type operator++ (int)
│ {
│ return OCTAVE_ATOMIC_POST_INCREMENT (&m_count);
297 │ lock addl $0x1,0x10(%rax)
1 │ mov vtable for Array<std::__cxx11::basic_string<char,
std::char_traits<char,%rax
│ add $0x10,%rax
16 │ mov %rax,0x28(%rbx)
│ explicit dim_vector (void) : rep (nil_rep ())
│ → callq dim_vector::nil_rep()@plt
│ mov %rax,0x30(%rbx)
│ { OCTAVE_ATOMIC_INCREMENT (&(count ())); }
294 │ lock addq $0x1,-0x10(%rax)
│ : dimensions (), rep (nil_rep ()), slice_data (rep->data),
1 │ → callq Array<std::__cxx11::basic_string<char,
std::char_traits<char
│ slice_len (rep->len)
--- End Code Annotation ---
I can change the atomic instructions to ordinary ones by configuring with
--disable-atomic-refcount. The benchmark runtime drops from 14.1 seconds
to 11.6 seconds (a saving of 2.5 seconds, or roughly 18%), which seems
important.