1
0
Fork 0
mirror of https://github.com/NixOS/nix synced 2025-06-26 07:31:15 +02:00

Add a Git-based content-addressed tarball cache

GitArchiveInputScheme now streams tarballs into a Git repository. This
deduplicates data a lot, e.g. when you're fetching different revisions
of the Nixpkgs repo. It also warns if the tree hash returned by GitHub
doesn't match the tree hash of the imported tarball.
This commit is contained in:
Eelco Dolstra 2023-11-29 12:35:08 +01:00
parent a8fea5a54f
commit b36857ac8d
5 changed files with 272 additions and 37 deletions

View file

@ -4,6 +4,7 @@
#include "finally.hh"
#include "processes.hh"
#include "signals.hh"
#include "users.hh"
#include <boost/core/span.hpp>
@ -21,6 +22,9 @@
#include <git2/submodule.h>
#include <git2/tree.h>
#include "tarfile.hh"
#include <archive_entry.h>
#include <unordered_set>
#include <queue>
#include <regex>
@ -307,6 +311,158 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
return std::nullopt;
}
TarballInfo importTarball(Source & source) override
{
TarArchive archive(source);
struct PendingDir
{
std::string name;
TreeBuilder builder;
};
std::vector<PendingDir> pendingDirs;
auto pushBuilder = [&](std::string name)
{
git_treebuilder * b;
if (git_treebuilder_new(&b, *this, nullptr))
throw Error("creating a tree builder: %s", git_error_last()->message);
pendingDirs.push_back({ .name = std::move(name), .builder = TreeBuilder(b) });
};
auto popBuilder = [&]() -> std::pair<git_oid, std::string>
{
assert(!pendingDirs.empty());
auto pending = std::move(pendingDirs.back());
git_oid oid;
if (git_treebuilder_write(&oid, pending.builder.get()))
throw Error("creating a tree object: %s", git_error_last()->message);
pendingDirs.pop_back();
return {oid, pending.name};
};
auto addToTree = [&](const std::string & name, const git_oid & oid, git_filemode_t mode)
{
assert(!pendingDirs.empty());
auto & pending = pendingDirs.back();
if (git_treebuilder_insert(nullptr, pending.builder.get(), name.c_str(), &oid, mode))
throw Error("adding a file to a tree builder: %s", git_error_last()->message);
};
auto updateBuilders = [&](boost::span<const std::string> names)
{
// Find the common prefix of pendingDirs and names.
size_t prefixLen = 0;
for (; prefixLen < names.size() && prefixLen + 1 < pendingDirs.size(); ++prefixLen)
if (names[prefixLen] != pendingDirs[prefixLen + 1].name)
break;
// Finish the builders that are not part of the common prefix.
for (auto n = pendingDirs.size(); n > prefixLen + 1; --n) {
auto [oid, name] = popBuilder();
addToTree(name, oid, GIT_FILEMODE_TREE);
}
// Create builders for the new directories.
for (auto n = prefixLen; n < names.size(); ++n)
pushBuilder(names[n]);
};
pushBuilder("");
size_t componentsToStrip = 1;
time_t lastModified = 0;
for (;;) {
// FIXME: merge with extract_archive
struct archive_entry * entry;
int r = archive_read_next_header(archive.archive, &entry);
if (r == ARCHIVE_EOF) break;
auto path = archive_entry_pathname(entry);
if (!path)
throw Error("cannot get archive member name: %s", archive_error_string(archive.archive));
if (r == ARCHIVE_WARN)
warn(archive_error_string(archive.archive));
else
archive.check(r);
lastModified = std::max(lastModified, archive_entry_mtime(entry));
auto pathComponents = tokenizeString<std::vector<std::string>>(path, "/");
boost::span<const std::string> pathComponents2{pathComponents};
if (pathComponents2.size() <= componentsToStrip) continue;
pathComponents2 = pathComponents2.subspan(componentsToStrip);
updateBuilders(
archive_entry_filetype(entry) == AE_IFDIR
? pathComponents2
: pathComponents2.first(pathComponents2.size() - 1));
switch (archive_entry_filetype(entry)) {
case AE_IFDIR:
// Nothing to do right now.
break;
case AE_IFREG: {
git_writestream * stream = nullptr;
if (git_blob_create_from_stream(&stream, *this, nullptr))
throw Error("creating a blob stream object: %s", git_error_last()->message);
while (true) {
std::vector<unsigned char> buf(128 * 1024);
auto n = archive_read_data(archive.archive, buf.data(), buf.size());
if (n < 0)
throw Error("cannot read file '%s' from tarball", path);
if (n == 0) break;
if (stream->write(stream, (const char *) buf.data(), n))
throw Error("writing a blob for tarball member '%s': %s", path, git_error_last()->message);
}
git_oid oid;
if (git_blob_create_from_stream_commit(&oid, stream))
throw Error("creating a blob object for tarball member '%s': %s", path, git_error_last()->message);
addToTree(*pathComponents.rbegin(), oid,
archive_entry_mode(entry) & S_IXUSR
? GIT_FILEMODE_BLOB_EXECUTABLE
: GIT_FILEMODE_BLOB);
break;
}
case AE_IFLNK: {
auto target = archive_entry_symlink(entry);
git_oid oid;
if (git_blob_create_from_buffer(&oid, *this, target, strlen(target)))
throw Error("creating a blob object for tarball symlink member '%s': %s", path, git_error_last()->message);
addToTree(*pathComponents.rbegin(), oid, GIT_FILEMODE_LINK);
break;
}
default:
throw Error("file '%s' in tarball has unsupported file type", path);
}
}
updateBuilders({});
auto [oid, _name] = popBuilder();
return TarballInfo {
.treeHash = toHash(oid),
.lastModified = lastModified
};
}
std::vector<std::tuple<Submodule, Hash>> getSubmodules(const Hash & rev) override;
std::string resolveSubmoduleUrl(
@ -449,6 +605,22 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this<GitRepoImpl>
else
throw Error("Commit signature verification on commit %s failed: %s", rev.gitRev(), output);
}
Hash treeHashToNarHash(const Hash & treeHash) override
{
auto accessor = getAccessor(treeHash);
fetchers::Attrs cacheKey({{"_what", "treeHashToNarHash"}, {"treeHash", treeHash.gitRev()}});
if (auto res = fetchers::getCache()->lookup(cacheKey))
return Hash::parseAny(fetchers::getStrAttr(*res, "narHash"), htSHA256);
auto narHash = accessor->hashPath(CanonPath::root);
fetchers::getCache()->upsert(cacheKey, fetchers::Attrs({{"narHash", narHash.to_string(HashFormat::SRI, true)}}));
return narHash;
}
};
ref<GitRepo> GitRepo::openRepo(const CanonPath & path, bool create, bool bare)
@ -673,5 +845,11 @@ std::vector<std::tuple<GitRepoImpl::Submodule, Hash>> GitRepoImpl::getSubmodules
return result;
}
ref<GitRepo> getTarballCache()
{
static CanonPath repoDir(getCacheDir() + "/nix/tarball-cache");
return make_ref<GitRepoImpl>(repoDir, true, true);
}
}