1
0
Fork 0
mirror of https://github.com/NixOS/nix synced 2025-06-24 22:11:15 +02:00

Merge pull request #13258 from NaN-git/opt-symbol-table

Optimize symbol table
This commit is contained in:
Robert Hensing 2025-06-07 13:16:14 +02:00 committed by GitHub
commit 102259898c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 212 additions and 65 deletions

View file

@ -2,6 +2,7 @@
#include "nix/expr/eval-settings.hh"
#include "nix/expr/primops.hh"
#include "nix/expr/print-options.hh"
#include "nix/expr/symbol-table.hh"
#include "nix/util/exit.hh"
#include "nix/util/types.hh"
#include "nix/util/util.hh"
@ -89,6 +90,11 @@ std::string printValue(EvalState & state, Value & v)
return out.str();
}
Value * Value::toPtr(SymbolStr str) noexcept
{
return const_cast<Value *>(str.valuePtr());
}
void Value::print(EvalState & state, std::ostream & str, PrintOptions options)
{
printValue(state, str, *this, options);
@ -3013,7 +3019,7 @@ void EvalState::printStatistics()
// XXX: overrides earlier assignment
topObj["symbols"] = json::array();
auto &list = topObj["symbols"];
symbols.dump([&](const std::string & s) { list.emplace_back(s); });
symbols.dump([&](std::string_view s) { list.emplace_back(s); });
}
if (outPath == "-") {
std::cerr << topObj.dump(2) << std::endl;

View file

@ -1,6 +1,7 @@
#pragma once
#include "nix/expr/value.hh"
#include "nix/expr/symbol-table.hh"
namespace nix {

View file

@ -1,51 +1,35 @@
#pragma once
///@file
#include <list>
#include <map>
#include <unordered_map>
#include "nix/util/types.hh"
#include <memory_resource>
#include "nix/expr/value.hh"
#include "nix/util/chunked-vector.hh"
#include "nix/util/error.hh"
#include <boost/version.hpp>
#define USE_FLAT_SYMBOL_SET (BOOST_VERSION >= 108100)
#if USE_FLAT_SYMBOL_SET
# include <boost/unordered/unordered_flat_set.hpp>
#else
# include <boost/unordered/unordered_set.hpp>
#endif
namespace nix {
/**
* This class mainly exists to give us an operator<< for ostreams. We could also
* return plain strings from SymbolTable, but then we'd have to wrap every
* instance of a symbol that is fmt()ed, which is inconvenient and error-prone.
*/
class SymbolStr
class SymbolValue : protected Value
{
friend class SymbolStr;
friend class SymbolTable;
private:
const std::string * s;
uint32_t size_;
uint32_t idx;
explicit SymbolStr(const std::string & symbol): s(&symbol) {}
SymbolValue() = default;
public:
bool operator == (std::string_view s2) const
operator std::string_view() const noexcept
{
return *s == s2;
}
const char * c_str() const
{
return s->c_str();
}
operator const std::string_view () const
{
return *s;
}
friend std::ostream & operator <<(std::ostream & os, const SymbolStr & symbol);
bool empty() const
{
return s->empty();
return {payload.string.c_str, size_};
}
};
@ -56,24 +40,161 @@ public:
*/
class Symbol
{
friend class SymbolStr;
friend class SymbolTable;
private:
uint32_t id;
explicit Symbol(uint32_t id): id(id) {}
explicit Symbol(uint32_t id) noexcept : id(id) {}
public:
Symbol() : id(0) {}
Symbol() noexcept : id(0) {}
explicit operator bool() const { return id > 0; }
[[gnu::always_inline]]
explicit operator bool() const noexcept { return id > 0; }
auto operator<=>(const Symbol other) const { return id <=> other.id; }
bool operator==(const Symbol other) const { return id == other.id; }
auto operator<=>(const Symbol other) const noexcept { return id <=> other.id; }
bool operator==(const Symbol other) const noexcept { return id == other.id; }
friend class std::hash<Symbol>;
};
/**
* This class mainly exists to give us an operator<< for ostreams. We could also
* return plain strings from SymbolTable, but then we'd have to wrap every
* instance of a symbol that is fmt()ed, which is inconvenient and error-prone.
*/
class SymbolStr
{
friend class SymbolTable;
constexpr static size_t chunkSize{8192};
using SymbolValueStore = ChunkedVector<SymbolValue, chunkSize>;
const SymbolValue * s;
struct Key
{
using HashType = boost::hash<std::string_view>;
SymbolValueStore & store;
std::string_view s;
std::size_t hash;
std::pmr::polymorphic_allocator<char> & alloc;
Key(SymbolValueStore & store, std::string_view s, std::pmr::polymorphic_allocator<char> & stringAlloc)
: store(store)
, s(s)
, hash(HashType{}(s))
, alloc(stringAlloc) {}
};
public:
SymbolStr(const SymbolValue & s) noexcept : s(&s) {}
SymbolStr(const Key & key)
{
auto size = key.s.size();
if (size >= std::numeric_limits<uint32_t>::max()) {
throw Error("Size of symbol exceeds 4GiB and cannot be stored");
}
// for multi-threaded implementations: lock store and allocator here
const auto & [v, idx] = key.store.add(SymbolValue{});
if (size == 0) {
v.mkString("", nullptr);
} else {
auto s = key.alloc.allocate(size + 1);
memcpy(s, key.s.data(), size);
s[size] = '\0';
v.mkString(s, nullptr);
}
v.size_ = size;
v.idx = idx;
this->s = &v;
}
bool operator == (std::string_view s2) const noexcept
{
return *s == s2;
}
[[gnu::always_inline]]
const char * c_str() const noexcept
{
return s->payload.string.c_str;
}
[[gnu::always_inline]]
operator std::string_view () const noexcept
{
return *s;
}
friend std::ostream & operator <<(std::ostream & os, const SymbolStr & symbol);
[[gnu::always_inline]]
bool empty() const noexcept
{
return s->size_ == 0;
}
[[gnu::always_inline]]
size_t size() const noexcept
{
return s->size_;
}
[[gnu::always_inline]]
const Value * valuePtr() const noexcept
{
return s;
}
explicit operator Symbol() const noexcept
{
return Symbol{s->idx + 1};
}
struct Hash
{
using is_transparent = void;
using is_avalanching = std::true_type;
std::size_t operator()(SymbolStr str) const
{
return Key::HashType{}(*str.s);
}
std::size_t operator()(const Key & key) const noexcept
{
return key.hash;
}
};
struct Equal
{
using is_transparent = void;
bool operator()(SymbolStr a, SymbolStr b) const noexcept
{
// strings are unique, so that a pointer comparison is OK
return a.s == b.s;
}
bool operator()(SymbolStr a, const Key & b) const noexcept
{
return a == b.s;
}
[[gnu::always_inline]]
bool operator()(const Key & a, SymbolStr b) const noexcept
{
return operator()(b, a);
}
};
};
/**
* Symbol table used by the parser and evaluator to represent and look
* up identifiers and attributes efficiently.
@ -82,29 +203,46 @@ class SymbolTable
{
private:
/**
* Map from string view (backed by ChunkedVector) -> offset into the store.
* SymbolTable is an append only data structure.
* During its lifetime the monotonic buffer holds all strings and nodes, if the symbol set is node based.
*/
std::pmr::monotonic_buffer_resource buffer;
std::pmr::polymorphic_allocator<char> stringAlloc{&buffer};
SymbolStr::SymbolValueStore store{16};
/**
* Transparent lookup of string view for a pointer to a ChunkedVector entry -> return offset into the store.
* ChunkedVector references are never invalidated.
*/
std::unordered_map<std::string_view, uint32_t> symbols;
ChunkedVector<std::string, 8192> store{16};
#if USE_FLAT_SYMBOL_SET
boost::unordered_flat_set<SymbolStr, SymbolStr::Hash, SymbolStr::Equal> symbols{SymbolStr::chunkSize};
#else
using SymbolValueAlloc = std::pmr::polymorphic_allocator<SymbolStr>;
boost::unordered_set<SymbolStr, SymbolStr::Hash, SymbolStr::Equal, SymbolValueAlloc> symbols{SymbolStr::chunkSize, {&buffer}};
#endif
public:
/**
* Converts a string into a symbol.
*/
Symbol create(std::string_view s)
{
Symbol create(std::string_view s) {
// Most symbols are looked up more than once, so we trade off insertion performance
// for lookup performance.
// FIXME: make this thread-safe.
auto it = symbols.find(s);
if (it != symbols.end())
return Symbol(it->second + 1);
return [&]<typename T>(T && key) -> Symbol {
if constexpr (requires { symbols.insert<T>(key); }) {
auto [it, _] = symbols.insert<T>(key);
return Symbol(*it);
} else {
auto it = symbols.find<T>(key);
if (it != symbols.end())
return Symbol(*it);
const auto & [rawSym, idx] = store.add(s);
symbols.emplace(rawSym, idx);
return Symbol(idx + 1);
it = symbols.emplace(key).first;
return Symbol(*it);
}
}(SymbolStr::Key{store, s, stringAlloc});
}
std::vector<SymbolStr> resolve(const std::vector<Symbol> & symbols) const
@ -118,12 +256,14 @@ public:
SymbolStr operator[](Symbol s) const
{
if (s.id == 0 || s.id > store.size())
uint32_t idx = s.id - uint32_t(1);
if (idx >= store.size())
unreachable();
return SymbolStr(store[s.id - 1]);
return store[idx];
}
size_t size() const
[[gnu::always_inline]]
size_t size() const noexcept
{
return store.size();
}
@ -147,3 +287,5 @@ struct std::hash<nix::Symbol>
return std::hash<decltype(s.id)>{}(s.id);
}
};
#undef USE_FLAT_SYMBOL_SET

View file

@ -5,7 +5,6 @@
#include <span>
#include "nix/expr/eval-gc.hh"
#include "nix/expr/symbol-table.hh"
#include "nix/expr/value/context.hh"
#include "nix/util/source-path.hh"
#include "nix/expr/print-options.hh"
@ -65,6 +64,7 @@ struct ExprLambda;
struct ExprBlackHole;
struct PrimOp;
class Symbol;
class SymbolStr;
class PosIdx;
struct Pos;
class StorePath;
@ -172,6 +172,11 @@ private:
public:
/**
* Never modify the backing `Value` object!
*/
static Value * toPtr(SymbolStr str) noexcept;
void print(EvalState &state, std::ostream &str, PrintOptions options = PrintOptions {});
// Functions needed to distinguish the type
@ -331,11 +336,6 @@ public:
void mkStringMove(const char * s, const NixStringContext & context);
inline void mkString(const SymbolStr & s)
{
mkString(s.c_str());
}
void mkPath(const SourcePath & path);
void mkPath(std::string_view path);

View file

@ -606,7 +606,7 @@ void ExprLambda::setDocComment(DocComment docComment) {
size_t SymbolTable::totalSize() const
{
size_t n = 0;
dump([&] (const std::string & s) { n += s.size(); });
dump([&] (SymbolStr s) { n += s.size(); });
return n;
}

View file

@ -2720,7 +2720,7 @@ static void prim_attrNames(EvalState & state, const PosIdx pos, Value * * args,
auto list = state.buildList(args[0]->attrs()->size());
for (const auto & [n, i] : enumerate(*args[0]->attrs()))
(list[n] = state.allocValue())->mkString(state.symbols[i.name]);
list[n] = Value::toPtr(state.symbols[i.name]);
std::sort(list.begin(), list.end(),
[](Value * v1, Value * v2) { return strcmp(v1->c_str(), v2->c_str()) < 0; });
@ -3180,9 +3180,8 @@ static void prim_mapAttrs(EvalState & state, const PosIdx pos, Value * * args, V
auto attrs = state.buildBindings(args[1]->attrs()->size());
for (auto & i : *args[1]->attrs()) {
Value * vName = state.allocValue();
Value * vName = Value::toPtr(state.symbols[i.name]);
Value * vFun2 = state.allocValue();
vName->mkString(state.symbols[i.name]);
vFun2->mkApp(args[0], vName);
attrs.alloc(i.name).mkApp(vFun2, i.value);
}
@ -3246,8 +3245,7 @@ static void prim_zipAttrsWith(EvalState & state, const PosIdx pos, Value * * arg
auto attrs = state.buildBindings(attrsSeen.size());
for (auto & [sym, elem] : attrsSeen) {
auto name = state.allocValue();
name->mkString(state.symbols[sym]);
auto name = Value::toPtr(state.symbols[sym]);
auto call1 = state.allocValue();
call1->mkApp(args[0], name);
auto call2 = state.allocValue();