From 5c29abc5bd2f2bda2342a66e62229054b6ff1a42 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 1 Feb 2023 00:06:46 +0100 Subject: [PATCH] GitArchiveInputScheme: Revert to downloading tarballs Tarballs are now unpacked into a content-addressed cache, specifically a Git repository in ~/.cache/nix/tarball-cache so that we can use GitAccessor to provide random access. --- src/libfetchers/git-accessor.cc | 212 +++++++++++++++++++++++++++--- src/libfetchers/github.cc | 94 ++++++------- src/libfetchers/input-accessor.hh | 4 + src/libfetchers/local.mk | 2 +- 4 files changed, 243 insertions(+), 69 deletions(-) diff --git a/src/libfetchers/git-accessor.cc b/src/libfetchers/git-accessor.cc index dc3fc0d89..9e6fcc021 100644 --- a/src/libfetchers/git-accessor.cc +++ b/src/libfetchers/git-accessor.cc @@ -1,5 +1,7 @@ #include "input-accessor.hh" +#include + #include #include #include @@ -8,6 +10,9 @@ #include #include +#include "tarfile.hh" +#include + namespace nix { template @@ -17,26 +22,35 @@ struct Deleter void operator()(T * p) const { del(p); }; }; +typedef std::unique_ptr> Repository; +typedef std::unique_ptr> TreeEntry; +typedef std::unique_ptr> Tree; +typedef std::unique_ptr> TreeBuilder; +typedef std::unique_ptr> Blob; + +static void initLibGit2() +{ + if (git_libgit2_init() < 0) + throw Error("initialising libgit2: %s", git_error_last()->message); +} + +static Repository openRepo(const CanonPath & path) +{ + initLibGit2(); + git_repository * _repo; + if (git_repository_open(&_repo, path.c_str())) + throw Error("opening Git repository '%s': %s", path, git_error_last()->message); + return Repository(_repo); +} + struct GitInputAccessor : InputAccessor { - typedef std::unique_ptr> Repository; - typedef std::unique_ptr> TreeEntry; - typedef std::unique_ptr> Tree; - typedef std::unique_ptr> Blob; - Repository repo; Tree root; - GitInputAccessor(const CanonPath & path, const Hash & rev) + GitInputAccessor(Repository && repo_, const Hash & rev) + : repo(std::move(repo_)) { - if (git_libgit2_init() < 0) - throw Error("initialising libgit2': %s", path, git_error_last()->message); - - git_repository * _repo; - if (git_repository_open(&_repo, path.c_str())) - throw Error("opening Git repository '%s': %s", path, git_error_last()->message); - repo = Repository(_repo); - git_oid oid; if (git_oid_fromstr(&oid, rev.gitRev().c_str())) throw Error("cannot convert '%s' to a Git OID", rev.gitRev()); @@ -203,8 +217,176 @@ struct GitInputAccessor : InputAccessor ref makeGitInputAccessor(const CanonPath & path, const Hash & rev) { - return make_ref(path, rev); + return make_ref(openRepo(path), rev); } +static Repository openTarballCache() +{ + static CanonPath repoDir(getCacheDir() + "/nix/tarball-cache"); + + initLibGit2(); + + if (pathExists(repoDir.abs())) + return openRepo(repoDir); + else { + git_repository * _repo; + if (git_repository_init(&_repo, repoDir.c_str(), true)) + throw Error("creating Git repository '%s': %s", repoDir, git_error_last()->message); + return Repository(_repo); + } +} + +Hash importTarball(Source & source) +{ + auto repo = openTarballCache(); + + TarArchive archive(source); + + struct PendingDir + { + std::string name; + TreeBuilder builder; + }; + + std::vector pendingDirs; + + auto pushBuilder = [&](std::string name) + { + git_treebuilder * b; + if (git_treebuilder_new(&b, repo.get(), nullptr)) + throw Error("creating a tree builder: %s", git_error_last()->message); + pendingDirs.push_back({ .name = std::move(name), .builder = TreeBuilder(b) }); + }; + + auto popBuilder = [&]() -> std::pair + { + assert(!pendingDirs.empty()); + auto pending = std::move(pendingDirs.back()); + git_oid oid; + if (git_treebuilder_write(&oid, pending.builder.get())) + throw Error("creating a tree object: %s", git_error_last()->message); + pendingDirs.pop_back(); + return {oid, pending.name}; + }; + + auto addToTree = [&](const std::string & name, const git_oid & oid, git_filemode_t mode) + { + assert(!pendingDirs.empty()); + auto & pending = pendingDirs.back(); + if (git_treebuilder_insert(nullptr, pending.builder.get(), name.c_str(), &oid, mode)) + throw Error("adding a file to a tree builder: %s", git_error_last()->message); + }; + + auto updateBuilders = [&](std::span names) + { + // Find the common prefix of pendingDirs and names. + size_t prefixLen = 0; + for (; prefixLen < names.size() && prefixLen + 1 < pendingDirs.size(); ++prefixLen) + if (names[prefixLen] != pendingDirs[prefixLen + 1].name) + break; + + // Finish the builders that are not part of the common prefix. + for (auto n = pendingDirs.size(); n > prefixLen + 1; --n) { + auto [oid, name] = popBuilder(); + addToTree(name, oid, GIT_FILEMODE_TREE); + } + + // Create builders for the new directories. + for (auto n = prefixLen; n < names.size(); ++n) + pushBuilder(names[n]); + + }; + + pushBuilder(""); + + size_t componentsToStrip = 1; + + for (;;) { + // FIXME: merge with extract_archive + struct archive_entry * entry; + int r = archive_read_next_header(archive.archive, &entry); + if (r == ARCHIVE_EOF) break; + auto path = archive_entry_pathname(entry); + if (!path) + throw Error("cannot get archive member name: %s", archive_error_string(archive.archive)); + if (r == ARCHIVE_WARN) + warn(archive_error_string(archive.archive)); + else + archive.check(r); + + auto pathComponents = tokenizeString>(path, "/"); + + std::span pathComponents2{pathComponents}; + + if (pathComponents2.size() <= componentsToStrip) continue; + pathComponents2 = pathComponents2.subspan(componentsToStrip); + + updateBuilders( + archive_entry_filetype(entry) == AE_IFDIR + ? pathComponents2 + : pathComponents2.first(pathComponents2.size() - 1)); + + switch (archive_entry_filetype(entry)) { + + case AE_IFDIR: + // Nothing to do right now. + break; + + case AE_IFREG: { + + git_writestream * stream = nullptr; + if (git_blob_create_from_stream(&stream, repo.get(), nullptr)) + throw Error("creating a blob stream object: %s", git_error_last()->message); + + while (true) { + std::vector buf(128 * 1024); + auto n = archive_read_data(archive.archive, buf.data(), buf.size()); + if (n < 0) + throw Error("cannot read file '%s' from tarball", path); + if (n == 0) break; + if (stream->write(stream, (const char *) buf.data(), n)) + throw Error("writing a blob for tarball member '%s': %s", path, git_error_last()->message); + } + + git_oid oid; + if (git_blob_create_from_stream_commit(&oid, stream)) + throw Error("creating a blob object for tarball member '%s': %s", path, git_error_last()->message); + + addToTree(*pathComponents.rbegin(), oid, + archive_entry_mode(entry) & S_IXUSR + ? GIT_FILEMODE_BLOB_EXECUTABLE + : GIT_FILEMODE_BLOB); + + break; + } + + case AE_IFLNK: { + auto target = archive_entry_symlink(entry); + + git_oid oid; + if (git_blob_create_from_buffer(&oid, repo.get(), target, strlen(target))) + throw Error("creating a blob object for tarball symlink member '%s': %s", path, git_error_last()->message); + + addToTree(*pathComponents.rbegin(), oid, GIT_FILEMODE_LINK); + + break; + } + + default: + throw Error("file '%s' in tarball has unsupported file type", path); + } + } + + updateBuilders({}); + + auto [oid, _name] = popBuilder(); + + return Hash::parseAny(git_oid_tostr_s(&oid), htSHA1); +} + +ref makeTarballCacheAccessor(const Hash & rev) +{ + return make_ref(openTarballCache(), rev); +} } diff --git a/src/libfetchers/github.cc b/src/libfetchers/github.cc index dd950d526..d3db9fed8 100644 --- a/src/libfetchers/github.cc +++ b/src/libfetchers/github.cc @@ -180,7 +180,7 @@ struct GitArchiveInputScheme : InputScheme virtual DownloadUrl getDownloadUrl(const Input & input) const = 0; - std::pair downloadArchive(ref store, Input input) const + std::pair downloadArchive(ref store, Input input) const { if (!maybeGetStrAttr(input.attrs, "ref")) input.attrs.insert_or_assign("ref", "HEAD"); @@ -190,62 +190,50 @@ struct GitArchiveInputScheme : InputScheme input.attrs.erase("ref"); input.attrs.insert_or_assign("rev", rev->gitRev()); - Attrs lockedAttrs({ - {"type", "git-zipball"}, - {"rev", rev->gitRev()}, - }); - - if (auto res = getCache()->lookup(store, lockedAttrs)) - return {std::move(res->second), std::move(input)}; - - auto url = getDownloadUrl(input); - - auto res = downloadFile(store, url.url, input.getName(), true, url.headers); - - getCache()->add( - store, - lockedAttrs, - { - {"rev", rev->gitRev()}, - }, - res.storePath, - true); - - return {res.storePath, std::move(input)}; - } - - std::pair, Input> getAccessor(ref store, const Input & input) const override - { - auto [storePath, input2] = downloadArchive(store, input); - - auto accessor = makeZipInputAccessor(CanonPath(store->toRealPath(storePath))); - - /* Compute the NAR hash of the contents of the zip file. This - is checked against the NAR hash in the lock file in - Input::checkLocks(). */ - auto key = fmt("zip-nar-hash-%s", store->toRealPath(storePath.to_string())); - auto cache = getCache(); - auto narHash = [&]() { - if (auto narHashS = cache->queryFact(key)) { - return Hash::parseSRI(*narHashS); - } else { - auto narHash = accessor->hashPath(CanonPath::root); - cache->upsertFact(key, narHash.to_string(SRI, true)); - return narHash; - } - }(); + auto treeHashKey = fmt("git-rev-to-tree-hash-%s", rev->gitRev()); - input2.attrs.insert_or_assign("narHash", narHash.to_string(SRI, true)); + if (auto treeHashS = cache->queryFact(treeHashKey)) { + auto treeHash = Hash::parseAny(*treeHashS, htSHA1); + // FIXME: verify that treeHash exists in the tarball cache. + return {std::move(input), treeHash}; + } + /* Stream the tarball into the tarball cache. */ + auto url = getDownloadUrl(input); + + auto source = sinkToSource([&](Sink & sink) { + FileTransferRequest req(url.url); + req.headers = url.headers; + getFileTransfer()->download(std::move(req), sink); + }); + + auto treeHash = importTarball(*source); + + // FIXME: verify against locked tree hash. + input.attrs.insert_or_assign("treeHash", treeHash.gitRev()); + + cache->upsertFact(treeHashKey, treeHash.gitRev()); + + return {std::move(input), treeHash}; + } + + std::pair, Input> getAccessor(ref store, const Input & _input) const override + { + auto [input, treeHash] = downloadArchive(store, _input); + + auto accessor = makeTarballCacheAccessor(treeHash); + + #if 0 auto lastModified = accessor->getLastModified(); assert(lastModified); - input2.attrs.insert_or_assign("lastModified", uint64_t(*lastModified)); + input.attrs.insert_or_assign("lastModified", uint64_t(*lastModified)); + #endif - accessor->setPathDisplay("«" + input2.to_string() + "»"); + accessor->setPathDisplay("«" + input.to_string() + "»"); - return {accessor, input2}; + return {accessor, input}; } bool isLocked(const Input & input) const override @@ -314,10 +302,10 @@ struct GitHubInputScheme : GitArchiveInputScheme // urls so we do not run into rate limits. const auto urlFmt = host != "github.com" - ? "https://%s/api/v3/repos/%s/%s/zipball/%s" + ? "https://%s/api/v3/repos/%s/%s/tarball/%s" : headers.empty() - ? "https://%s/%s/%s/archive/%s.zip" - : "https://api.%s/repos/%s/%s/zipball/%s"; + ? "https://%s/%s/%s/archive/%s.tar.gz" + : "https://api.%s/repos/%s/%s/tarball/%s"; const auto url = fmt(urlFmt, host, getOwner(input), getRepo(input), input.getRev()->to_string(Base16, false)); @@ -384,7 +372,7 @@ struct GitLabInputScheme : GitArchiveInputScheme // is 10 reqs/sec/ip-addr. See // https://docs.gitlab.com/ee/user/gitlab_com/index.html#gitlabcom-specific-rate-limits auto host = maybeGetStrAttr(input.attrs, "host").value_or("gitlab.com"); - auto url = fmt("https://%s/api/v4/projects/%s%%2F%s/repository/archive.zip?sha=%s", + auto url = fmt("https://%s/api/v4/projects/%s%%2F%s/repository/archive.tar.gz?sha=%s", host, getStrAttr(input.attrs, "owner"), getStrAttr(input.attrs, "repo"), input.getRev()->to_string(Base16, false)); diff --git a/src/libfetchers/input-accessor.hh b/src/libfetchers/input-accessor.hh index f107e433f..1289d5515 100644 --- a/src/libfetchers/input-accessor.hh +++ b/src/libfetchers/input-accessor.hh @@ -119,6 +119,10 @@ ref makePatchingInputAccessor( ref makeGitInputAccessor(const CanonPath & path, const Hash & rev); +Hash importTarball(Source & source); + +ref makeTarballCacheAccessor(const Hash & rev); + struct SourcePath { ref accessor; diff --git a/src/libfetchers/local.mk b/src/libfetchers/local.mk index cef74d212..4b27fd443 100644 --- a/src/libfetchers/local.mk +++ b/src/libfetchers/local.mk @@ -8,6 +8,6 @@ libfetchers_SOURCES := $(wildcard $(d)/*.cc) libfetchers_CXXFLAGS += -I src/libutil -I src/libstore -libfetchers_LDFLAGS += -pthread -lzip -lgit2 +libfetchers_LDFLAGS += -pthread -lzip -lgit2 -larchive libfetchers_LIBS = libutil libstore