From 94bbaddb93988352645e9b4bb4bec6c5ed730747 Mon Sep 17 00:00:00 2001 From: Philipp Otterbein Date: Sun, 18 May 2025 03:53:50 +0200 Subject: [PATCH 1/2] symbol-table: reduce memory usage and use boost::unordered_flat_set --- src/libexpr/eval.cc | 8 +- .../include/nix/expr/print-ambiguous.hh | 1 + src/libexpr/include/nix/expr/symbol-table.hh | 240 ++++++++++++++---- src/libexpr/include/nix/expr/value.hh | 7 +- src/libexpr/nixexpr.cc | 2 +- 5 files changed, 199 insertions(+), 59 deletions(-) diff --git a/src/libexpr/eval.cc b/src/libexpr/eval.cc index 1a067e75c..123f0e86a 100644 --- a/src/libexpr/eval.cc +++ b/src/libexpr/eval.cc @@ -2,6 +2,7 @@ #include "nix/expr/eval-settings.hh" #include "nix/expr/primops.hh" #include "nix/expr/print-options.hh" +#include "nix/expr/symbol-table.hh" #include "nix/util/exit.hh" #include "nix/util/types.hh" #include "nix/util/util.hh" @@ -918,6 +919,11 @@ void Value::mkStringMove(const char * s, const NixStringContext & context) mkString(s, encodeContext(context)); } +void Value::mkString(const SymbolStr & s) +{ + mkString(s.c_str(), nullptr); +} + void Value::mkPath(const SourcePath & path) { mkPath(&*path.accessor, makeImmutableString(path.path.abs())); @@ -3019,7 +3025,7 @@ void EvalState::printStatistics() // XXX: overrides earlier assignment topObj["symbols"] = json::array(); auto &list = topObj["symbols"]; - symbols.dump([&](const std::string & s) { list.emplace_back(s); }); + symbols.dump([&](std::string_view s) { list.emplace_back(s); }); } if (outPath == "-") { std::cerr << topObj.dump(2) << std::endl; diff --git a/src/libexpr/include/nix/expr/print-ambiguous.hh b/src/libexpr/include/nix/expr/print-ambiguous.hh index 09a849c49..9e5a27e6d 100644 --- a/src/libexpr/include/nix/expr/print-ambiguous.hh +++ b/src/libexpr/include/nix/expr/print-ambiguous.hh @@ -1,6 +1,7 @@ #pragma once #include "nix/expr/value.hh" +#include "nix/expr/symbol-table.hh" namespace nix { diff --git a/src/libexpr/include/nix/expr/symbol-table.hh b/src/libexpr/include/nix/expr/symbol-table.hh index c04cc041b..994bddd6c 100644 --- a/src/libexpr/include/nix/expr/symbol-table.hh +++ b/src/libexpr/include/nix/expr/symbol-table.hh @@ -1,51 +1,35 @@ #pragma once ///@file -#include -#include -#include - -#include "nix/util/types.hh" +#include +#include "nix/expr/value.hh" #include "nix/util/chunked-vector.hh" #include "nix/util/error.hh" +#include +#define USE_FLAT_SYMBOL_SET (BOOST_VERSION >= 108100) +#if USE_FLAT_SYMBOL_SET +# include +#else +# include +#endif + namespace nix { -/** - * This class mainly exists to give us an operator<< for ostreams. We could also - * return plain strings from SymbolTable, but then we'd have to wrap every - * instance of a symbol that is fmt()ed, which is inconvenient and error-prone. - */ -class SymbolStr +class SymbolValue : protected Value { + friend class SymbolStr; friend class SymbolTable; -private: - const std::string * s; + uint32_t size_; + uint32_t idx; - explicit SymbolStr(const std::string & symbol): s(&symbol) {} + SymbolValue() = default; public: - bool operator == (std::string_view s2) const + operator std::string_view() const noexcept { - return *s == s2; - } - - const char * c_str() const - { - return s->c_str(); - } - - operator const std::string_view () const - { - return *s; - } - - friend std::ostream & operator <<(std::ostream & os, const SymbolStr & symbol); - - bool empty() const - { - return s->empty(); + return {payload.string.c_str, size_}; } }; @@ -56,24 +40,155 @@ public: */ class Symbol { + friend class SymbolStr; friend class SymbolTable; private: uint32_t id; - explicit Symbol(uint32_t id): id(id) {} + explicit Symbol(uint32_t id) noexcept : id(id) {} public: - Symbol() : id(0) {} + Symbol() noexcept : id(0) {} - explicit operator bool() const { return id > 0; } + [[gnu::always_inline]] + explicit operator bool() const noexcept { return id > 0; } - auto operator<=>(const Symbol other) const { return id <=> other.id; } - bool operator==(const Symbol other) const { return id == other.id; } + auto operator<=>(const Symbol other) const noexcept { return id <=> other.id; } + bool operator==(const Symbol other) const noexcept { return id == other.id; } friend class std::hash; }; +/** + * This class mainly exists to give us an operator<< for ostreams. We could also + * return plain strings from SymbolTable, but then we'd have to wrap every + * instance of a symbol that is fmt()ed, which is inconvenient and error-prone. + */ +class SymbolStr +{ + friend class SymbolTable; + + constexpr static size_t chunkSize{8192}; + using SymbolValueStore = ChunkedVector; + + const SymbolValue * s; + + struct Key + { + using HashType = boost::hash; + + SymbolValueStore & store; + std::string_view s; + std::size_t hash; + std::pmr::polymorphic_allocator & alloc; + + Key(SymbolValueStore & store, std::string_view s, std::pmr::polymorphic_allocator & stringAlloc) + : store(store) + , s(s) + , hash(HashType{}(s)) + , alloc(stringAlloc) {} + }; + +public: + SymbolStr(const SymbolValue & s) noexcept : s(&s) {} + + SymbolStr(const Key & key) + { + auto size = key.s.size(); + if (size >= std::numeric_limits::max()) { + throw Error("Size of symbol exceeds 4GiB and cannot be stored"); + } + // for multi-threaded implementations: lock store and allocator here + const auto & [v, idx] = key.store.add(SymbolValue{}); + if (size == 0) { + v.mkString("", nullptr); + } else { + auto s = key.alloc.allocate(size + 1); + memcpy(s, key.s.data(), size); + s[size] = '\0'; + v.mkString(s, nullptr); + } + v.size_ = size; + v.idx = idx; + this->s = &v; + } + + bool operator == (std::string_view s2) const noexcept + { + return *s == s2; + } + + [[gnu::always_inline]] + const char * c_str() const noexcept + { + return s->payload.string.c_str; + } + + [[gnu::always_inline]] + operator std::string_view () const noexcept + { + return *s; + } + + friend std::ostream & operator <<(std::ostream & os, const SymbolStr & symbol); + + [[gnu::always_inline]] + bool empty() const noexcept + { + return s->size_ == 0; + } + + [[gnu::always_inline]] + size_t size() const noexcept + { + return s->size_; + } + + explicit operator Symbol() const noexcept + { + return Symbol{s->idx + 1}; + } + + struct Hash + { + using is_transparent = void; + using is_avalanching = std::true_type; + + std::size_t operator()(SymbolStr str) const + { + return Key::HashType{}(*str.s); + } + + std::size_t operator()(const Key & key) const noexcept + { + return key.hash; + } + }; + + struct Equal + { + using is_transparent = void; + + bool operator()(SymbolStr a, SymbolStr b) const noexcept + { + // strings are unique, so that a pointer comparison is OK + return a.s == b.s; + } + + bool operator()(SymbolStr a, const Key & b) const noexcept + { + return a == b.s; + } + + [[gnu::always_inline]] + bool operator()(const Key & a, SymbolStr b) const noexcept + { + return operator()(b, a); + } + }; +}; + /** * Symbol table used by the parser and evaluator to represent and look * up identifiers and attributes efficiently. @@ -82,29 +197,46 @@ class SymbolTable { private: /** - * Map from string view (backed by ChunkedVector) -> offset into the store. + * SymbolTable is an append only data structure. + * During its lifetime the monotonic buffer holds all strings and nodes, if the symbol set is node based. + */ + std::pmr::monotonic_buffer_resource buffer; + std::pmr::polymorphic_allocator stringAlloc{&buffer}; + SymbolStr::SymbolValueStore store{16}; + + /** + * Transparent lookup of string view for a pointer to a ChunkedVector entry -> return offset into the store. * ChunkedVector references are never invalidated. */ - std::unordered_map symbols; - ChunkedVector store{16}; +#if USE_FLAT_SYMBOL_SET + boost::unordered_flat_set symbols{SymbolStr::chunkSize}; +#else + using SymbolValueAlloc = std::pmr::polymorphic_allocator; + boost::unordered_set symbols{SymbolStr::chunkSize, {&buffer}}; +#endif public: /** * Converts a string into a symbol. */ - Symbol create(std::string_view s) - { + Symbol create(std::string_view s) { // Most symbols are looked up more than once, so we trade off insertion performance // for lookup performance. // FIXME: make this thread-safe. - auto it = symbols.find(s); - if (it != symbols.end()) - return Symbol(it->second + 1); + return [&](T && key) -> Symbol { + if constexpr (requires { symbols.insert(key); }) { + auto [it, _] = symbols.insert(key); + return Symbol(*it); + } else { + auto it = symbols.find(key); + if (it != symbols.end()) + return Symbol(*it); - const auto & [rawSym, idx] = store.add(s); - symbols.emplace(rawSym, idx); - return Symbol(idx + 1); + it = symbols.emplace(key).first; + return Symbol(*it); + } + }(SymbolStr::Key{store, s, stringAlloc}); } std::vector resolve(const std::vector & symbols) const @@ -118,12 +250,14 @@ public: SymbolStr operator[](Symbol s) const { - if (s.id == 0 || s.id > store.size()) + uint32_t idx = s.id - uint32_t(1); + if (idx >= store.size()) unreachable(); - return SymbolStr(store[s.id - 1]); + return store[idx]; } - size_t size() const + [[gnu::always_inline]] + size_t size() const noexcept { return store.size(); } @@ -147,3 +281,5 @@ struct std::hash return std::hash{}(s.id); } }; + +#undef USE_FLAT_SYMBOL_SET diff --git a/src/libexpr/include/nix/expr/value.hh b/src/libexpr/include/nix/expr/value.hh index e9cc1cd3f..3a96577a2 100644 --- a/src/libexpr/include/nix/expr/value.hh +++ b/src/libexpr/include/nix/expr/value.hh @@ -5,7 +5,6 @@ #include #include "nix/expr/eval-gc.hh" -#include "nix/expr/symbol-table.hh" #include "nix/expr/value/context.hh" #include "nix/util/source-path.hh" #include "nix/expr/print-options.hh" @@ -65,6 +64,7 @@ struct ExprLambda; struct ExprBlackHole; struct PrimOp; class Symbol; +class SymbolStr; class PosIdx; struct Pos; class StorePath; @@ -331,10 +331,7 @@ public: void mkStringMove(const char * s, const NixStringContext & context); - inline void mkString(const SymbolStr & s) - { - mkString(s.c_str()); - } + void mkString(const SymbolStr & s); void mkPath(const SourcePath & path); void mkPath(std::string_view path); diff --git a/src/libexpr/nixexpr.cc b/src/libexpr/nixexpr.cc index 1a71096d4..92071b22d 100644 --- a/src/libexpr/nixexpr.cc +++ b/src/libexpr/nixexpr.cc @@ -606,7 +606,7 @@ void ExprLambda::setDocComment(DocComment docComment) { size_t SymbolTable::totalSize() const { size_t n = 0; - dump([&] (const std::string & s) { n += s.size(); }); + dump([&] (SymbolStr s) { n += s.size(); }); return n; } From ed4e512dcd4969836663611c57b6e9903c48c5b2 Mon Sep 17 00:00:00 2001 From: Philipp Otterbein Date: Sun, 1 Jun 2025 21:00:01 +0200 Subject: [PATCH 2/2] symbol-table: reference entries instead of allocating `Value`s --- src/libexpr/eval.cc | 10 +++++----- src/libexpr/include/nix/expr/symbol-table.hh | 6 ++++++ src/libexpr/include/nix/expr/value.hh | 7 +++++-- src/libexpr/primops.cc | 8 +++----- 4 files changed, 19 insertions(+), 12 deletions(-) diff --git a/src/libexpr/eval.cc b/src/libexpr/eval.cc index 123f0e86a..d0069b321 100644 --- a/src/libexpr/eval.cc +++ b/src/libexpr/eval.cc @@ -90,6 +90,11 @@ std::string printValue(EvalState & state, Value & v) return out.str(); } +Value * Value::toPtr(SymbolStr str) noexcept +{ + return const_cast(str.valuePtr()); +} + void Value::print(EvalState & state, std::ostream & str, PrintOptions options) { printValue(state, str, *this, options); @@ -919,11 +924,6 @@ void Value::mkStringMove(const char * s, const NixStringContext & context) mkString(s, encodeContext(context)); } -void Value::mkString(const SymbolStr & s) -{ - mkString(s.c_str(), nullptr); -} - void Value::mkPath(const SourcePath & path) { mkPath(&*path.accessor, makeImmutableString(path.path.abs())); diff --git a/src/libexpr/include/nix/expr/symbol-table.hh b/src/libexpr/include/nix/expr/symbol-table.hh index 994bddd6c..1bf5b5543 100644 --- a/src/libexpr/include/nix/expr/symbol-table.hh +++ b/src/libexpr/include/nix/expr/symbol-table.hh @@ -145,6 +145,12 @@ public: return s->size_; } + [[gnu::always_inline]] + const Value * valuePtr() const noexcept + { + return s; + } + explicit operator Symbol() const noexcept { return Symbol{s->idx + 1}; diff --git a/src/libexpr/include/nix/expr/value.hh b/src/libexpr/include/nix/expr/value.hh index 3a96577a2..febe36f80 100644 --- a/src/libexpr/include/nix/expr/value.hh +++ b/src/libexpr/include/nix/expr/value.hh @@ -172,6 +172,11 @@ private: public: + /** + * Never modify the backing `Value` object! + */ + static Value * toPtr(SymbolStr str) noexcept; + void print(EvalState &state, std::ostream &str, PrintOptions options = PrintOptions {}); // Functions needed to distinguish the type @@ -331,8 +336,6 @@ public: void mkStringMove(const char * s, const NixStringContext & context); - void mkString(const SymbolStr & s); - void mkPath(const SourcePath & path); void mkPath(std::string_view path); diff --git a/src/libexpr/primops.cc b/src/libexpr/primops.cc index c4e6feb28..3017d259a 100644 --- a/src/libexpr/primops.cc +++ b/src/libexpr/primops.cc @@ -2710,7 +2710,7 @@ static void prim_attrNames(EvalState & state, const PosIdx pos, Value * * args, auto list = state.buildList(args[0]->attrs()->size()); for (const auto & [n, i] : enumerate(*args[0]->attrs())) - (list[n] = state.allocValue())->mkString(state.symbols[i.name]); + list[n] = Value::toPtr(state.symbols[i.name]); std::sort(list.begin(), list.end(), [](Value * v1, Value * v2) { return strcmp(v1->c_str(), v2->c_str()) < 0; }); @@ -3170,9 +3170,8 @@ static void prim_mapAttrs(EvalState & state, const PosIdx pos, Value * * args, V auto attrs = state.buildBindings(args[1]->attrs()->size()); for (auto & i : *args[1]->attrs()) { - Value * vName = state.allocValue(); + Value * vName = Value::toPtr(state.symbols[i.name]); Value * vFun2 = state.allocValue(); - vName->mkString(state.symbols[i.name]); vFun2->mkApp(args[0], vName); attrs.alloc(i.name).mkApp(vFun2, i.value); } @@ -3236,8 +3235,7 @@ static void prim_zipAttrsWith(EvalState & state, const PosIdx pos, Value * * arg auto attrs = state.buildBindings(attrsSeen.size()); for (auto & [sym, elem] : attrsSeen) { - auto name = state.allocValue(); - name->mkString(state.symbols[sym]); + auto name = Value::toPtr(state.symbols[sym]); auto call1 = state.allocValue(); call1->mkApp(args[0], name); auto call2 = state.allocValue();