From 3a1301cd6db698a212a0c036e40ad402bd8a2a12 Mon Sep 17 00:00:00 2001 From: Sergei Zimmerman Date: Tue, 6 May 2025 21:58:52 +0000 Subject: [PATCH] libstore: Use `boost::regex` for GC root discovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As it turns out using `std::regex` is actually the bottleneck for root discovery. Just substituting `std::` -> `boost::` makes root discovery twice as fast (3x if counting only userspace time). Some rather ad-hoc measurements to motivate the switch: (On master) ``` nix build github:nixos/nix/1e822bd4149a8bce1da81ee2ad9404986b07914c#nix-cli --out-link result-1e822bd4149a8bce1da81ee2ad9404986b07914c taskset -c 2,3 hyperfine "result-1e822bd4149a8bce1da81ee2ad9404986b07914c/bin/nix store gc --dry-run --max 0" Benchmark 1: result-1e822bd4149a8bce1da81ee2ad9404986b07914c/bin/nix store gc --dry-run --max 0 Time (mean ± σ): 481.6 ms ± 3.9 ms [User: 336.2 ms, System: 142.0 ms] Range (min … max): 474.6 ms … 487.7 ms 10 runs ``` (After this patch) ``` taskset -c 2,3 hyperfine "result/bin/nix store gc --dry-run --max 0" Benchmark 1: result/bin/nix store gc --dry-run --max 0 Time (mean ± σ): 254.7 ms ± 9.7 ms [User: 111.1 ms, System: 141.3 ms] Range (min … max): 246.5 ms … 281.3 ms 10 runs ``` `boost::regex` is a drop-in replacement for `std::regex`, but much faster. Doing a simple before/after comparison doesn't surface any change in behavior: ``` result/bin/nix store gc --dry-run -vvvvv --max 0 |& grep "got additional" | wc -l result-1e822bd4149a8bce1da81ee2ad9404986b07914c/bin/nix store gc --dry-run -vvvvv --max 0 |& grep "got additional" | wc -l ``` --- src/libstore/gc.cc | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/src/libstore/gc.cc b/src/libstore/gc.cc index f6a4124ff..e9fe72afe 100644 --- a/src/libstore/gc.cc +++ b/src/libstore/gc.cc @@ -13,10 +13,11 @@ # include "nix/util/processes.hh" #endif +#include + #include #include #include -#include #include #include @@ -331,8 +332,8 @@ static void readProcLink(const std::filesystem::path & file, UncheckedRoots & ro static std::string quoteRegexChars(const std::string & raw) { - static auto specialRegex = std::regex(R"([.^$\\*+?()\[\]{}|])"); - return std::regex_replace(raw, specialRegex, R"(\$&)"); + static auto specialRegex = boost::regex(R"([.^$\\*+?()\[\]{}|])"); + return boost::regex_replace(raw, specialRegex, R"(\$&)"); } #ifdef __linux__ @@ -354,12 +355,12 @@ void LocalStore::findRuntimeRoots(Roots & roots, bool censor) auto procDir = AutoCloseDir{opendir("/proc")}; if (procDir) { struct dirent * ent; - auto digitsRegex = std::regex(R"(^\d+$)"); - auto mapRegex = std::regex(R"(^\s*\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+(/\S+)\s*$)"); - auto storePathRegex = std::regex(quoteRegexChars(storeDir) + R"(/[0-9a-z]+[0-9a-zA-Z\+\-\._\?=]*)"); + static const auto digitsRegex = boost::regex(R"(^\d+$)"); + static const auto mapRegex = boost::regex(R"(^\s*\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+(/\S+)\s*$)"); + auto storePathRegex = boost::regex(quoteRegexChars(storeDir) + R"(/[0-9a-z]+[0-9a-zA-Z\+\-\._\?=]*)"); while (errno = 0, ent = readdir(procDir.get())) { checkInterrupt(); - if (std::regex_match(ent->d_name, digitsRegex)) { + if (boost::regex_match(ent->d_name, digitsRegex)) { try { readProcLink(fmt("/proc/%s/exe" ,ent->d_name), unchecked); readProcLink(fmt("/proc/%s/cwd", ent->d_name), unchecked); @@ -386,15 +387,15 @@ void LocalStore::findRuntimeRoots(Roots & roots, bool censor) std::filesystem::path mapFile = fmt("/proc/%s/maps", ent->d_name); auto mapLines = tokenizeString>(readFile(mapFile.string()), "\n"); for (const auto & line : mapLines) { - auto match = std::smatch{}; - if (std::regex_match(line, match, mapRegex)) + auto match = boost::smatch{}; + if (boost::regex_match(line, match, mapRegex)) unchecked[match[1]].emplace(mapFile.string()); } auto envFile = fmt("/proc/%s/environ", ent->d_name); auto envString = readFile(envFile); - auto env_end = std::sregex_iterator{}; - for (auto i = std::sregex_iterator{envString.begin(), envString.end(), storePathRegex}; i != env_end; ++i) + auto env_end = boost::sregex_iterator{}; + for (auto i = boost::sregex_iterator{envString.begin(), envString.end(), storePathRegex}; i != env_end; ++i) unchecked[i->str()].emplace(envFile); } catch (SystemError & e) { if (errno == ENOENT || errno == EACCES || errno == ESRCH) @@ -413,12 +414,12 @@ void LocalStore::findRuntimeRoots(Roots & roots, bool censor) // Because of this we disable lsof when running the tests. if (getEnv("_NIX_TEST_NO_LSOF") != "1") { try { - std::regex lsofRegex(R"(^n(/.*)$)"); + boost::regex lsofRegex(R"(^n(/.*)$)"); auto lsofLines = tokenizeString>(runProgram(LSOF, true, { "-n", "-w", "-F", "n" }), "\n"); for (const auto & line : lsofLines) { - std::smatch match; - if (std::regex_match(line, match, lsofRegex)) + boost::smatch match; + if (boost::regex_match(line, match, lsofRegex)) unchecked[match[1].str()].emplace("{lsof}"); } } catch (ExecError & e) {