diff --git a/src/libutil-tests/meson.build b/src/libutil-tests/meson.build index f2552550d..b3776e094 100644 --- a/src/libutil-tests/meson.build +++ b/src/libutil-tests/meson.build @@ -65,6 +65,7 @@ sources = files( 'position.cc', 'processes.cc', 'references.cc', + 'sort.cc', 'spawn.cc', 'strings.cc', 'suggestions.cc', diff --git a/src/libutil-tests/sort.cc b/src/libutil-tests/sort.cc new file mode 100644 index 000000000..8eee961c8 --- /dev/null +++ b/src/libutil-tests/sort.cc @@ -0,0 +1,274 @@ +#include +#include +#include "nix/util/sort.hh" + +#include +#include +#include +#include + +namespace nix { + +struct MonotonicSubranges : public ::testing::Test +{ + std::vector empty_; + std::vector basic_ = {1, 0, -1, -100, 10, 10, 20, 40, 5, 5, 20, 10, 10, 1, -5}; +}; + +TEST_F(MonotonicSubranges, empty) +{ + ASSERT_EQ(weaklyIncreasingPrefix(empty_.begin(), empty_.end()), empty_.begin()); + ASSERT_EQ(weaklyIncreasingSuffix(empty_.begin(), empty_.end()), empty_.begin()); + ASSERT_EQ(strictlyDecreasingPrefix(empty_.begin(), empty_.end()), empty_.begin()); + ASSERT_EQ(strictlyDecreasingSuffix(empty_.begin(), empty_.end()), empty_.begin()); +} + +TEST_F(MonotonicSubranges, basic) +{ + ASSERT_EQ(strictlyDecreasingPrefix(basic_.begin(), basic_.end()), basic_.begin() + 4); + ASSERT_EQ(strictlyDecreasingSuffix(basic_.begin(), basic_.end()), basic_.begin() + 12); + std::reverse(basic_.begin(), basic_.end()); + ASSERT_EQ(weaklyIncreasingPrefix(basic_.begin(), basic_.end()), basic_.begin() + 5); + ASSERT_EQ(weaklyIncreasingSuffix(basic_.begin(), basic_.end()), basic_.begin() + 11); +} + +template +class SortTestPermutations : public ::testing::Test +{ + std::vector initialData = {std::numeric_limits::max(), std::numeric_limits::min(), 0, 0, 42, 126, 36}; + std::vector vectorData; + std::list listData; + +public: + std::vector scratchVector; + std::list scratchList; + std::vector empty; + + void SetUp() override + { + vectorData = initialData; + std::sort(vectorData.begin(), vectorData.end()); + listData = std::list(vectorData.begin(), vectorData.end()); + } + + bool nextPermutation() + { + std::next_permutation(vectorData.begin(), vectorData.end()); + std::next_permutation(listData.begin(), listData.end()); + scratchList = listData; + scratchVector = vectorData; + return vectorData == initialData; + } +}; + +using SortPermutationsTypes = ::testing::Types; + +TYPED_TEST_SUITE(SortTestPermutations, SortPermutationsTypes); + +TYPED_TEST(SortTestPermutations, insertionsort) +{ + while (!this->nextPermutation()) { + auto & list = this->scratchList; + insertionsort(list.begin(), list.end()); + ASSERT_TRUE(std::is_sorted(list.begin(), list.end())); + auto & vector = this->scratchVector; + insertionsort(vector.begin(), vector.end()); + ASSERT_TRUE(std::is_sorted(vector.begin(), vector.end())); + } +} + +TYPED_TEST(SortTestPermutations, peeksort) +{ + while (!this->nextPermutation()) { + auto & vector = this->scratchVector; + peeksort(vector.begin(), vector.end()); + ASSERT_TRUE(std::is_sorted(vector.begin(), vector.end())); + } +} + +TEST(InsertionSort, empty) +{ + std::vector empty; + insertionsort(empty.begin(), empty.end()); +} + +struct RandomPeekSort : public ::testing::TestWithParam< + std::tuple> +{ + using ValueType = int; + std::vector data_; + std::mt19937 urng_; + std::uniform_int_distribution distribution_; + + void SetUp() override + { + auto [maxSize, min, max, iterations] = GetParam(); + urng_ = std::mt19937(GTEST_FLAG_GET(random_seed)); + distribution_ = std::uniform_int_distribution(min, max); + } + + auto regenerate() + { + auto [maxSize, min, max, iterations] = GetParam(); + std::size_t dataSize = std::uniform_int_distribution(0, maxSize)(urng_); + data_.resize(dataSize); + std::generate(data_.begin(), data_.end(), [&]() { return distribution_(urng_); }); + } +}; + +TEST_P(RandomPeekSort, defaultComparator) +{ + auto [maxSize, min, max, iterations] = GetParam(); + + for (std::size_t i = 0; i < iterations; ++i) { + regenerate(); + peeksort(data_.begin(), data_.end()); + ASSERT_TRUE(std::is_sorted(data_.begin(), data_.end())); + /* Sorting is idempotent */ + peeksort(data_.begin(), data_.end()); + ASSERT_TRUE(std::is_sorted(data_.begin(), data_.end())); + } +} + +TEST_P(RandomPeekSort, greater) +{ + auto [maxSize, min, max, iterations] = GetParam(); + + for (std::size_t i = 0; i < iterations; ++i) { + regenerate(); + peeksort(data_.begin(), data_.end(), std::greater{}); + ASSERT_TRUE(std::is_sorted(data_.begin(), data_.end(), std::greater{})); + /* Sorting is idempotent */ + peeksort(data_.begin(), data_.end(), std::greater{}); + ASSERT_TRUE(std::is_sorted(data_.begin(), data_.end(), std::greater{})); + } +} + +TEST_P(RandomPeekSort, brokenComparator) +{ + auto [maxSize, min, max, iterations] = GetParam(); + + /* This is a pretty nice way of modeling a worst-case scenario for a broken comparator. + If the sorting algorithm doesn't break in such case, then surely all deterministic + predicates won't break it. */ + auto comp = [&]([[maybe_unused]] const auto & lhs, [[maybe_unused]] const auto & rhs) -> bool { + return std::uniform_int_distribution(0, 1)(urng_); + }; + + for (std::size_t i = 0; i < iterations; ++i) { + regenerate(); + auto originalData = data_; + peeksort(data_.begin(), data_.end(), comp); + /* Check that the output is just a reordering of the input. This is the + contract of the implementation in regard to comparators that don't + define a strict weak order. */ + std::sort(data_.begin(), data_.end()); + std::sort(originalData.begin(), originalData.end()); + ASSERT_EQ(originalData, data_); + } +} + +TEST_P(RandomPeekSort, stability) +{ + auto [maxSize, min, max, iterations] = GetParam(); + + for (std::size_t i = 0; i < iterations; ++i) { + regenerate(); + std::vector> pairs; + + /* Assign sequential ids to objects. After the sort ids for equivalent + elements should be in ascending order. */ + std::transform( + data_.begin(), data_.end(), std::back_inserter(pairs), [id = std::size_t{0}](auto && val) mutable { + return std::pair{val, ++id}; + }); + + auto comp = [&]([[maybe_unused]] const auto & lhs, [[maybe_unused]] const auto & rhs) -> bool { + return lhs.first > rhs.first; + }; + + peeksort(pairs.begin(), pairs.end(), comp); + ASSERT_TRUE(std::is_sorted(pairs.begin(), pairs.end(), comp)); + + for (auto begin = pairs.begin(), end = pairs.end(); begin < end; ++begin) { + auto key = begin->first; + auto innerEnd = std::find_if_not(begin, end, [key](const auto & lhs) { return lhs.first == key; }); + ASSERT_TRUE(std::is_sorted(begin, innerEnd, [](const auto & lhs, const auto & rhs) { + return lhs.second < rhs.second; + })); + begin = innerEnd; + } + } +} + +using RandomPeekSortParamType = RandomPeekSort::ParamType; + +INSTANTIATE_TEST_SUITE_P( + PeekSort, + RandomPeekSort, + ::testing::Values( + RandomPeekSortParamType{128, std::numeric_limits::min(), std::numeric_limits::max(), 1024}, + RandomPeekSortParamType{7753, -32, 32, 128}, + RandomPeekSortParamType{11719, std::numeric_limits::min(), std::numeric_limits::max(), 64}, + RandomPeekSortParamType{4063, 0, 32, 256}, + RandomPeekSortParamType{771, -8, 8, 2048}, + RandomPeekSortParamType{433, 0, 1, 2048}, + RandomPeekSortParamType{0, 0, 0, 1}, /* empty case */ + RandomPeekSortParamType{ + 1, std::numeric_limits::min(), std::numeric_limits::max(), 1}, /* single element */ + RandomPeekSortParamType{ + 2, std::numeric_limits::min(), std::numeric_limits::max(), 2}, /* two elements */ + RandomPeekSortParamType{55425, std::numeric_limits::min(), std::numeric_limits::max(), 128})); + +template +struct SortProperty : public ::testing::Test +{}; + +using SortPropertyTypes = ::testing::Types; +TYPED_TEST_SUITE(SortProperty, SortPropertyTypes); + +RC_GTEST_TYPED_FIXTURE_PROP(SortProperty, peeksortSorted, (std::vector vec)) +{ + peeksort(vec.begin(), vec.end()); + RC_ASSERT(std::is_sorted(vec.begin(), vec.end())); +} + +RC_GTEST_TYPED_FIXTURE_PROP(SortProperty, peeksortSortedGreater, (std::vector vec)) +{ + auto comp = std::greater(); + peeksort(vec.begin(), vec.end(), comp); + RC_ASSERT(std::is_sorted(vec.begin(), vec.end(), comp)); +} + +RC_GTEST_TYPED_FIXTURE_PROP(SortProperty, insertionsortSorted, (std::vector vec)) +{ + insertionsort(vec.begin(), vec.end()); + RC_ASSERT(std::is_sorted(vec.begin(), vec.end())); +} + +RC_GTEST_PROP(SortProperty, peeksortStability, (std::vector> vec)) +{ + auto comp = [](auto lhs, auto rhs) { return lhs.first < rhs.first; }; + auto copy = vec; + std::stable_sort(copy.begin(), copy.end(), comp); + peeksort(vec.begin(), vec.end(), comp); + RC_ASSERT(copy == vec); +} + +RC_GTEST_TYPED_FIXTURE_PROP(SortProperty, peeksortSortedLinearComparisonComplexity, (std::vector vec)) +{ + peeksort(vec.begin(), vec.end()); + RC_ASSERT(std::is_sorted(vec.begin(), vec.end())); + std::size_t comparisonCount = 0; + auto countingComp = [&](auto lhs, auto rhs) { + ++comparisonCount; + return lhs < rhs; + }; + + peeksort(vec.begin(), vec.end(), countingComp); + + /* In the sorted case comparison complexify should be linear. */ + RC_ASSERT(comparisonCount <= vec.size()); +} + +} // namespace nix diff --git a/src/libutil/include/nix/util/meson.build b/src/libutil/include/nix/util/meson.build index 6c95a6ced..22438c1d0 100644 --- a/src/libutil/include/nix/util/meson.build +++ b/src/libutil/include/nix/util/meson.build @@ -59,6 +59,7 @@ headers = files( 'signals.hh', 'signature/local-keys.hh', 'signature/signer.hh', + 'sort.hh', 'source-accessor.hh', 'source-path.hh', 'split.hh', diff --git a/src/libutil/include/nix/util/sort.hh b/src/libutil/include/nix/util/sort.hh new file mode 100644 index 000000000..0affdf3ce --- /dev/null +++ b/src/libutil/include/nix/util/sort.hh @@ -0,0 +1,299 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +/** + * @file + * + * In-house implementation of sorting algorithms. Used for cases when several properties + * need to be upheld regardless of the stdlib implementation of std::sort or + * std::stable_sort. + * + * PeekSort implementation is adapted from reference implementation + * https://github.com/sebawild/powersort licensed under the MIT License. + * + */ + +/* PeekSort attribution: + * + * MIT License + * + * Copyright (c) 2022 Sebastian Wild + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +namespace nix { + +/** + * Merge sorted runs [begin, middle) with [middle, end) in-place [begin, end). + * Uses a temporary working buffer by first copying [begin, end) to it. + * + * @param begin Start of the first subrange to be sorted. + * @param middle End of the first sorted subrange and the start of the second. + * @param end End of the second sorted subrange. + * @param workingBegin Start of the working buffer. + * @param comp Comparator implementing an operator()(const ValueType& lhs, const ValueType& rhs). + * + * @pre workingBegin buffer must have at least std::distance(begin, end) elements. + * + * @note We can't use std::inplace_merge or std::merge, because their behavior + * is undefined if the comparator is not strict weak ordering. + */ +template< + std::forward_iterator Iter, + std::random_access_iterator BufIter, + typename Comparator = std::less>> +void mergeSortedRunsInPlace(Iter begin, Iter middle, Iter end, BufIter workingBegin, Comparator comp = {}) +{ + const BufIter workingMiddle = std::move(begin, middle, workingBegin); + const BufIter workingEnd = std::move(middle, end, workingMiddle); + + Iter output = begin; + BufIter workingLeft = workingBegin; + BufIter workingRight = workingMiddle; + + while (workingLeft != workingMiddle && workingRight != workingEnd) { + /* Note the inversion here !comp(...., ....). This is required for the merge to be stable. + If a == b where a if from the left part and b is the the right, then we have to pick + a. */ + *output++ = !comp(*workingRight, *workingLeft) ? std::move(*workingLeft++) : std::move(*workingRight++); + } + + std::move(workingLeft, workingMiddle, output); + std::move(workingRight, workingEnd, output); +} + +/** + * Simple insertion sort. + * + * Does not require that the std::iter_value_t is copyable. + * + * @param begin Start of the range to sort. + * @param end End of the range to sort. + * @comp Comparator the defines the ordering. Order of elements if the comp is not strict weak ordering + * is not specified. + * @throws Nothing. + * + * Note on exception safety: this function provides weak exception safety + * guarantees. To elaborate: if the comparator throws or move assignment + * throws (value type is not nothrow_move_assignable) then the range is left in + * a consistent, but unspecified state. + * + * @note This can't be implemented in terms of binary search if the strict weak ordering + * needs to be handled in a well-defined but unspecified manner. + */ +template>> +void insertionsort(Iter begin, Iter end, Comparator comp = {}) +{ + if (begin == end) + return; + for (Iter current = std::next(begin); current != end; ++current) { + for (Iter insertionPoint = current; + insertionPoint != begin && comp(*insertionPoint, *std::prev(insertionPoint)); + --insertionPoint) { + std::swap(*insertionPoint, *std::prev(insertionPoint)); + } + } +} + +/** + * Find maximal i <= end such that [begin, i) is strictly decreasing according + * to the specified comparator. + */ +template>> +Iter strictlyDecreasingPrefix(Iter begin, Iter end, Comparator && comp = {}) +{ + if (begin == end) + return begin; + while (std::next(begin) != end && /* *std::next(begin) < begin */ + comp(*std::next(begin), *begin)) + ++begin; + return std::next(begin); +} + +/** + * Find minimal i >= start such that [i, end) is strictly decreasing according + * to the specified comparator. + */ +template>> +Iter strictlyDecreasingSuffix(Iter begin, Iter end, Comparator && comp = {}) +{ + if (begin == end) + return end; + while (std::prev(end) > begin && /* *std::prev(end) < *std::prev(end, 2) */ + comp(*std::prev(end), *std::prev(end, 2))) + --end; + return std::prev(end); +} + +/** + * Find maximal i <= end such that [begin, i) is weakly increasing according + * to the specified comparator. + */ +template>> +Iter weaklyIncreasingPrefix(Iter begin, Iter end, Comparator && comp = {}) +{ + return strictlyDecreasingPrefix(begin, end, std::not_fn(std::forward(comp))); +} + +/** + * Find minimal i >= start such that [i, end) is weakly increasing according + * to the specified comparator. + */ +template>> +Iter weaklyIncreasingSuffix(Iter begin, Iter end, Comparator && comp = {}) +{ + return strictlyDecreasingSuffix(begin, end, std::not_fn(std::forward(comp))); +} + +/** + * Peeksort stable sorting algorithm. Sorts elements in-place. + * Allocates additional memory as needed. + * + * @details + * PeekSort is a stable, near-optimal natural mergesort. Most importantly, like any + * other mergesort it upholds the "Ord safety" property. Meaning that even for + * comparator predicates that don't satisfy strict weak ordering it can't result + * in infinite loops/out of bounds memory accesses or other undefined behavior. + * + * As a quick reminder, strict weak ordering relation operator< must satisfy + * the following properties. Keep in mind that in C++ an equvalence relation + * is specified in terms of operator< like so: a ~ b iff !(a < b) && !(b < a). + * + * 1. a < a === false - relation is irreflexive + * 2. a < b, b < c => a < c - transitivity + * 3. a ~ b, a ~ b, b ~ c => a ~ c, transitivity of equivalence + * + * @see https://www.wild-inter.net/publications/munro-wild-2018 + * @see https://github.com/Voultapher/sort-research-rs/blob/main/writeup/sort_safety/text.md#property-analysis + * + * The order of elements when comp is not strict weak ordering is not specified, but + * is not undefined. The output is always some permutation of the input, regardless + * of the comparator provided. + * Relying on ordering in such cases is erroneous, but this implementation + * will happily accept broken comparators and will not crash. + * + * @param begin Start of the range to be sorted. + * @param end End of the range to be sorted. + * @comp comp Comparator implementing an operator()(const ValueType& lhs, const ValueType& rhs). + * + * @throws std::bad_alloc if the temporary buffer can't be allocated. + * + * @return Nothing. + * + * Note on exception safety: this function provides weak exception safety + * guarantees. To elaborate: if the comparator throws or move assignment + * throws (value type is not nothrow_move_assignable) then the range is left in + * a consistent, but unspecified state. + * + */ +template>> +/* ValueType must be default constructible to create the temporary buffer */ + requires std::is_default_constructible_v> +void peeksort(Iter begin, Iter end, Comparator comp = {}) +{ + auto length = std::distance(begin, end); + + /* Special-case very simple inputs. This is identical to how libc++ does it. */ + switch (length) { + case 0: + [[fallthrough]]; + case 1: + return; + case 2: + if (comp(*--end, *begin)) /* [a, b], b < a */ + std::swap(*begin, *end); + return; + } + + using ValueType = std::iter_value_t; + auto workingBuffer = std::vector(length); + + /* + * sorts [begin, end), assuming that [begin, leftRunEnd) and + * [rightRunBegin, end) are sorted. + * Modified implementation from: + * https://github.com/sebawild/powersort/blob/1d078b6be9023e134c4f8f6de88e2406dc681e89/src/sorts/peeksort.h + */ + auto peeksortImpl = [&workingBuffer, + &comp](auto & peeksortImpl, Iter begin, Iter end, Iter leftRunEnd, Iter rightRunBegin) { + if (leftRunEnd == end || rightRunBegin == begin) + return; + + /* Dispatch to simpler insertion sort implementation for smaller cases + Cut-off limit is the same as in libstdc++ + https://github.com/gcc-mirror/gcc/blob/d9375e490072d1aae73a93949aa158fcd2a27018/libstdc%2B%2B-v3/include/bits/stl_algo.h#L4977 + */ + static constexpr std::size_t insertionsortThreshold = 16; + size_t length = std::distance(begin, end); + if (length <= insertionsortThreshold) + return insertionsort(begin, end, comp); + + Iter middle = std::next(begin, (length / 2)); /* Middle split between m and m - 1 */ + + if (middle <= leftRunEnd) { + /* |XXXXXXXX|XX X| */ + peeksortImpl(peeksortImpl, leftRunEnd, end, std::next(leftRunEnd), rightRunBegin); + mergeSortedRunsInPlace(begin, leftRunEnd, end, workingBuffer.begin(), comp); + return; + } else if (middle >= rightRunBegin) { + /* |XX X|XXXXXXXX| */ + peeksortImpl(peeksortImpl, begin, rightRunBegin, leftRunEnd, std::prev(rightRunBegin)); + mergeSortedRunsInPlace(begin, rightRunBegin, end, workingBuffer.begin(), comp); + return; + } + + /* Find middle run, i.e., run containing m - 1 */ + Iter i, j; + + if (!comp(*middle, *std::prev(middle)) /* *std::prev(middle) <= *middle */) { + i = weaklyIncreasingSuffix(leftRunEnd, middle, comp); + j = weaklyIncreasingPrefix(std::prev(middle), rightRunBegin, comp); + } else { + i = strictlyDecreasingSuffix(leftRunEnd, middle, comp); + j = strictlyDecreasingPrefix(std::prev(middle), rightRunBegin, comp); + std::reverse(i, j); + } + + if (i == begin && j == end) + return; /* single run */ + + if (middle - i < j - middle) { + /* |XX x|xxxx X| */ + peeksortImpl(peeksortImpl, begin, i, leftRunEnd, std::prev(i)); + peeksortImpl(peeksortImpl, i, end, j, rightRunBegin); + mergeSortedRunsInPlace(begin, i, end, workingBuffer.begin(), comp); + } else { + /* |XX xxx|x X| */ + peeksortImpl(peeksortImpl, begin, j, leftRunEnd, i); + peeksortImpl(peeksortImpl, j, end, std::next(j), rightRunBegin); + mergeSortedRunsInPlace(begin, j, end, workingBuffer.begin(), comp); + } + }; + + peeksortImpl(peeksortImpl, begin, end, /*leftRunEnd=*/begin, /*rightRunBegin=*/end); +} + +}