Performance improvement for std::list::sort.

  • Thread starter Thread starter rcgldrc
  • Start date Start date
R

rcgldrc

Guest
In the transition from VS 2013 to VS 2015, std::list::sort() was changed from a bottom up merge sort using an array of lists to top down merge sort using iterators. The switch to using iterators eliminated allocations issues such as a caller passed list with no default allocator. Although the no default allocator issue could be handled with the initial declaration of all members of the array of lists with _Myt(get_allocator()), with the switch to iterators, the merge logic which is implemented via splice() to move nodes within a list would be operating only on the caller's list, providing exception safety: if the compare throws an exception, the caller's list would be reordered, but all there, instead of parts of it scattered within the array of list.

The performance issue with the top down strategy is this line:

iterator _Mid = _STD next(_First, static_cast<difference_type>(_Size >> 1));
The code is recursively scanning lists and sub-lists in order to split them into two parts. If the nodes are within cache boundaries, it is not much of a performance hit, but if the list is large and the nodes are randomly scattered, top down strategy takes about 40% longer.


The switch to top down strategy wasn't needed, as the prior bottom up strategy can be converted to using an array of iterators to keep track of runs within the caller's list, using the same merge via splice logic to move nodes within the callers list. It turned out to be relatively straight forward as all merges involved adjacent runs, and the array only needs to store the iterators to the start of runs, as a prior (non-empty) entry or a local variable will contain the iterator to the end of a run.

Example standalone code:

template <typename T>
typename std::list<T>::iterator Merge(std::list<T> &ll,
typename std::list<T>::iterator li,
typename std::list<T>::iterator ri,
typename std::list<T>::iterator ei);

// iterator array size
#define ASZ 32

template <typename T>
void SortList(std::list<T> &ll)
{
if (ll.size() < 2) // return if nothing to do
return;
std::list<T>::iterator ai[ASZ]; // array of iterators
std::list<T>::iterator li; // left iterator
std::list<T>::iterator ri; // right iterator
std::list<T>::iterator ei; // end iterator
size_t i;
for (i = 0; i < ASZ; i++) // "clear" array
ai = ll.end();
// merge nodes into array
for (ei = ll.begin(); ei != ll.end();) {
ri = ei++;
for (i = 0; (i < ASZ) && ai != ll.end(); i++) {
ri = Merge(ll, ai, ri, ei);
ai = ll.end();
}
if(i == ASZ)
i--;
ai = ri;
}
// merge array into single list
ei = ll.end();
for(i = 0; (i < ASZ) && ai == ei; i++);
ri = ai[i++];
while(1){
for( ; (i < ASZ) && ai == ei; i++);
if (i == ASZ)
break;
li = ai[i++];
ri = Merge(ll, li, ri, ei);
}
}

template <typename T>
typename std::list<T>::iterator Merge(std::list<T> &ll,
typename std::list<T>::iterator li,
typename std::list<T>::iterator ri,
typename std::list<T>::iterator ei)
{
std::list<T>::iterator ni;
(*ri < *li) ? ni = ri : ni = li;
while(1){
if(*ri < *li){
ll.splice(li, ll, ri++);
if(ri == ei)
return ni;
} else {
if(++li == ri)
return ni;
}
}
}



Example replacement for _Sort() in VS 2019's version of <list>. The merge code is the same, but moved into a separate function _Merge().

private:
template <class _Pr2>
iterator _Merge(_Pr2 _Pred, iterator _First, iterator _Mid, iterator _Last){
iterator _Newfirst = _First;
for (bool _Initial_loop = true;;
_Initial_loop = false) { // [_First, _Mid) and [_Mid, _Last) are sorted and non-empty
if (_DEBUG_LT_PRED(_Pred, *_Mid, *_First)) { // consume _Mid
if (_Initial_loop) {
_Newfirst = _Mid; // update return value
}
splice(_First, *this, _Mid++);
if (_Mid == _Last) {
return _Newfirst; // exhausted [_Mid, _Last); done
}
}
else { // consume _First
++_First;
if (_First == _Mid) {
return _Newfirst; // exhausted [_First, _Mid); done
}
}
}
}

template <class _Pr2>
void _Sort(iterator _First, iterator _Last, _Pr2 _Pred,
size_type _Size) { // order [_First, _Last), using _Pred, return new first
// _Size must be distance from _First to _Last
if (_Size < 2) {
return; // nothing to do
}
const size_t _ASZ = 32; // array size
iterator _Ai[_ASZ]; // array of iterators to runs
iterator _Mi; // middle iterator
iterator _Li; // last iterator
size_t _I; // index to _Ai
for (_I = 0; _I < _ASZ; _I++) // "empty" array
_Ai[_I] = _Last; // _Ai[] == _Last => empty entry
// merge nodes into array
for (_Li = _First; _Li != _Last;) {
_Mi = _Li++;
for (_I = 0; (_I < _ASZ) && _Ai[_I] != _Last; _I++) {
_Mi = _Merge(_Pass_fn(_Pred), _Ai[_I], _Mi, _Li);
_Ai[_I] = _Last;
}
if (_I == _ASZ)
_I--;
_Ai[_I] = _Mi;
}
// merge array runs into single run
for (_I = 0; _I < _ASZ && _Ai[_I] == _Last; _I++);
_Mi = _Ai[_I++];
while (1) {
for (; _I < _ASZ && _Ai[_I] == _Last; _I++);
if (_I == _ASZ)
break;
_Mi = _Merge(_Pass_fn(_Pred), _Ai[_I++], _Mi, _Last);
}
}

Test and benchmark code, using include "listx.h" instead of <list>.

#include <ctime>
#include <iostream>
#include "listx.h" // copy of <list>

typedef unsigned long long uint64_t;

static clock_t ctTimeStart; // clock values
static clock_t ctTimeStop;

#define COUNT (4*1024*1024-1) // number of values to sort

int main(void)
{
uint64_t r = COUNT; // random number

std::list<uint64_t> ll(COUNT);
std::list<uint64_t>::iterator it;
// fill list with random values
for(it = ll.begin(); it != ll.end(); it++){
r = (((uint64_t)((rand()>>4) & 0xff)) << 0);
r += (((uint64_t)((rand()>>4) & 0xff)) << 8);
r += (((uint64_t)((rand()>>4) & 0xff)) <<16);
r += (((uint64_t)((rand()>>4) & 0xff)) <<24);
r += (((uint64_t)((rand()>>4) & 0xff)) <<32);
r += (((uint64_t)((rand()>>4) & 0xff)) <<40);
r += (((uint64_t)((rand()>>4) & 0xff)) <<48);
r += (((uint64_t)((rand()>>4) & 0xff)) <<56);
*it = r;
}

ctTimeStart = clock();
ll.sort();
ctTimeStop = clock();
std::cout << "# of ticks " << ctTimeStop - ctTimeStart << std::endl;

it = ll.begin();
r = *it;
while (++it != ll.end()) {
if (r > *it) {
std::cout << "error" << std::endl;
break;
}
r = *it;
}

// refill scrambled list with random values
for (it = ll.begin(); it != ll.end(); it++) {
r = (((uint64_t)((rand() >> 4) & 0xff)) << 0);
r += (((uint64_t)((rand() >> 4) & 0xff)) << 8);
r += (((uint64_t)((rand() >> 4) & 0xff)) << 16);
r += (((uint64_t)((rand() >> 4) & 0xff)) << 24);
r += (((uint64_t)((rand() >> 4) & 0xff)) << 32);
r += (((uint64_t)((rand() >> 4) & 0xff)) << 40);
r += (((uint64_t)((rand() >> 4) & 0xff)) << 48);
r += (((uint64_t)((rand() >> 4) & 0xff)) << 56);
*it = r;
}

ctTimeStart = clock();
ll.sort();
ctTimeStop = clock();
std::cout << "# of ticks " << ctTimeStop - ctTimeStart << std::endl;

it = ll.begin();
r = *it;
while (++it != ll.end()) {
if (r > *it) {
std::cout << "error" << std::endl;
break;
}
r = *it;
}

return(0);
}

Continue reading...
 
Back
Top