diff --git a/src/libfetchers/git-lfs-fetch.hh b/src/libfetchers/git-lfs-fetch.hh index 3946dac05..84cc056fa 100644 --- a/src/libfetchers/git-lfs-fetch.hh +++ b/src/libfetchers/git-lfs-fetch.hh @@ -117,7 +117,7 @@ struct Fetch { void init(git_repository* repo, const std::string& gitattributesContent); bool hasAttribute(const std::string& path, const std::string& attrName, const std::string& attrValue) const; - void fetch(const std::string& pointerFileContents, const std::string& pointerFilePath, Sink& sink) const; + void fetch(const git_blob* pointerBlob, const std::string& pointerFilePath, Sink& sink) const; std::vector fetchUrls(const std::vector &metadatas) const; }; @@ -185,8 +185,6 @@ void downloadToSink(const std::string &url, const std::string &authHeader, Sink } - - std::string getLfsApiToken(const GitUrl& u) { const auto [maybeUserAndHost, path] = u.toSsh(); auto [status, output] = runProgram(RunOptions { @@ -216,7 +214,6 @@ std::string getLfsEndpointUrl(git_repository *repo) { return ""; } - const char *url_c_str = git_remote_url(remote); if (!url_c_str) { return ""; @@ -241,35 +238,60 @@ std::string git_attr_value_to_string(git_attr_value_t value) { } -Md parseLfsMetadata(const std::string &content, const std::string &filename) { +std::optional parseLfsMetadata(const std::string &content, const std::string &filename) { + // https://github.com/git-lfs/git-lfs/blob/2ef4108/docs/spec.md + // // example git-lfs pointer file: // version https://git-lfs.github.com/spec/v1 // oid sha256:f5e02aa71e67f41d79023a128ca35bad86cf7b6656967bfe0884b3a3c4325eaf // size 10000000 - std::istringstream iss(content); - std::string line; - std::string oid; - size_t size = 0; + // (ending \n) - while (getline(iss, line)) { - std::size_t pos = line.find("oid sha256:"); - if (pos != std::string::npos) { - oid = line.substr(pos + 11); // skip "oid sha256:" - continue; - } - - pos = line.find("size "); - if (pos != std::string::npos) { - std::string sizeStr = - line.substr(pos + 5); // skip "size " - size = std::stol(sizeStr); - continue; - } + if (!content.starts_with("version ")) { + // Invalid pointer file + return std::nullopt; } - return Md{filename, oid, size}; -} + if (!content.starts_with("version https://git-lfs.github.com/spec/v1")) { + // In case there's new spec versions in the future, but for now only v1 exists + debug("Invalid version found on potential lfs pointer file, skipping"); + return std::nullopt; + } + std::istringstream iss(content); + std::string line; + + std::string oid; + std::string size; + + while (getline(iss, line)) { + if (line.starts_with("version ")) { + continue; + } + if (line.starts_with("oid sha256:")) { + oid = line.substr(11); // skip "oid sha256:" + continue; + } + if (line.starts_with("size ")) { + size = line.substr(5); // skip "size " + continue; + } + + debug("Custom extension '%s' found, ignoring", line); + } + + if (oid.length() != 64 || !std::all_of(oid.begin(), oid.end(), ::isxdigit)) { + debug("Invalid sha256 %s, skipping", oid); + return std::nullopt; + } + + if (size.length() == 0 || !std::all_of(size.begin(), size.end(), ::isdigit)) { + debug("Invalid size %s, skipping", size); + return std::nullopt; + } + + return std::make_optional(Md{filename, oid, std::stoul(size)}); +} // there's already a ParseURL here https://github.com/b-camacho/nix/blob/ef6fa54e05cd4134ec41b0d64c1a16db46237f83/src/libutil/url.cc#L13 @@ -386,7 +408,6 @@ void Fetch::init(git_repository* repo, const std::string& gitattributesContent) } - bool Fetch::hasAttribute(const std::string& path, const std::string& attrName, const std::string& attrValue) const { for (auto it = rules.rbegin(); it != rules.rend(); ++it) { @@ -481,10 +502,32 @@ std::vector Fetch::fetchUrls(const std::vector &metadatas) c } } -void Fetch::fetch(const std::string& pointerFileContents, const std::string& pointerFilePath, Sink& sink) const { +void Fetch::fetch(const git_blob* pointerBlob, const std::string& pointerFilePath, Sink& sink) const { + constexpr size_t chunkSize = 128 * 1024; // 128 KiB + auto size = git_blob_rawsize(pointerBlob); + + if (size >= 1024) { + debug("Skip git-lfs, pointer file too large"); + warn("Encountered a file that should have been a pointer, but wasn't: %s", pointerFilePath); + for (size_t offset = 0; offset < size; offset += chunkSize) { + sink(std::string((const char *) git_blob_rawcontent(pointerBlob) + offset, std::min(chunkSize, size - offset))); + } + return; + } + + const auto pointerFileContents = std::string((const char *) git_blob_rawcontent(pointerBlob), size); const auto md = parseLfsMetadata(std::string(pointerFileContents), std::string(pointerFilePath)); + if (md == std::nullopt) { + debug("Skip git-lfs, invalid pointer file"); + warn("Encountered a file that should have been a pointer, but wasn't: %s", pointerFilePath); + for (size_t offset = 0; offset < size; offset += chunkSize) { + sink(std::string((const char *) git_blob_rawcontent(pointerBlob) + offset, std::min(chunkSize, size - offset))); + } + return; + } + std::vector vMds; - vMds.push_back(md); + vMds.push_back(md.value()); const auto objUrls = fetchUrls(vMds); const auto obj = objUrls[0]; @@ -507,4 +550,3 @@ void Fetch::fetch(const std::string& pointerFileContents, const std::string& poi } // namespace lfs } // namespace nix - diff --git a/src/libfetchers/git-utils.cc b/src/libfetchers/git-utils.cc index dc8086a2c..073e3fcad 100644 --- a/src/libfetchers/git-utils.cc +++ b/src/libfetchers/git-utils.cc @@ -676,29 +676,31 @@ struct GitSourceAccessor : SourceAccessor , root(peelToTreeOrBlob(lookupObject(*repo, hashToOID(rev)).get())) , lfsFetch(lfsFetch) { + if (lfsFetch && !lookup(CanonPath(".gitattributes"))) { + warn("Requested to fetch lfs files, but no .gitattributes file was found, ignoring"); + } } std::string readBlob(const CanonPath & path, bool symlink) { const auto blob = getBlob(path, symlink); - const auto data = std::string((const char *) git_blob_rawcontent(blob.get()), git_blob_rawsize(blob.get())); - - if (path != CanonPath(".gitattributes") && lfsFetch) { + if (lfsFetch && path != CanonPath(".gitattributes") && lookup(CanonPath(".gitattributes"))) { auto& _lfsFetch = *lfsFetch; if (!_lfsFetch.ready) { const auto contents = readFile(CanonPath(".gitattributes")); _lfsFetch.init(*repo, contents); - }; + } + auto pathStr = std::string(path.rel()); if (_lfsFetch.hasAttribute(pathStr, "filter", "lfs")) { StringSink s; - _lfsFetch.fetch(data, pathStr, s); + _lfsFetch.fetch(blob.get(), pathStr, s); return s.s; } } - return data; + return std::string((const char *) git_blob_rawcontent(blob.get()), git_blob_rawsize(blob.get())); } void readFile( @@ -709,11 +711,7 @@ struct GitSourceAccessor : SourceAccessor auto size = git_blob_rawsize(blob.get()); sizeCallback(size); - // if lfs, this is just a pointer file - // if not lfs then it's not big either way - auto contents = std::string((const char *) git_blob_rawcontent(blob.get()), size); - - if (lfsFetch && path != CanonPath(".gitattributes")) { + if (lfsFetch && path != CanonPath(".gitattributes") && lookup(CanonPath(".gitattributes"))) { auto& _lfsFetch = *lfsFetch; if (!_lfsFetch.ready) { const auto contents = readFile(CanonPath(".gitattributes")); @@ -722,13 +720,15 @@ struct GitSourceAccessor : SourceAccessor auto pathStr = std::string(path.rel()); if (_lfsFetch.hasAttribute(pathStr, "filter", "lfs")) { - _lfsFetch.fetch(contents, pathStr, sink); + _lfsFetch.fetch(blob.get(), pathStr, sink); return; } } - // either not using lfs or file should not be smudged - sink(contents); + constexpr size_t chunkSize = 128 * 1024; // 128 KiB + for (size_t offset = 0; offset < size; offset += chunkSize) { + sink(std::string((const char *) git_blob_rawcontent(blob.get()) + offset, std::min(chunkSize, size - offset))); + } } std::string readFile(const CanonPath & path) override diff --git a/tests/nixos/fetch-git/test-cases/lfs/default.nix b/tests/nixos/fetch-git/test-cases/lfs/default.nix index e414a8018..a339cea91 100644 --- a/tests/nixos/fetch-git/test-cases/lfs/default.nix +++ b/tests/nixos/fetch-git/test-cases/lfs/default.nix @@ -1,72 +1,179 @@ { # mostly copied from https://github.com/NixOS/nix/blob/358c26fd13a902d9a4032a00e6683571be07a384/tests/nixos/fetch-git/test-cases/fetchTree-shallow/default.nix#L1 # ty @DavHau - description = "fetchGit smudges LFS pointers iff lfs=true"; + description = "fetchGit smudges LFS pointers if lfs=true"; script = '' + expected_max_size_lfs_pointer = 1024 # 1 KiB (values >= than this cannot be pointers, and test files are >= 1 MiB) + # purge nix git cache to make sure we start with a clean slate client.succeed("rm -rf ~/.cache/nix") - # add an lfs-enrolled file to the repo: - client.succeed(f"dd if=/dev/urandom of={repo.path}/beeg bs=1M count=1") - client.succeed(f"{repo.git} lfs install") - client.succeed(f"{repo.git} lfs track --filename \"beeg\"") - client.succeed(f"{repo.git} add .gitattributes") - client.succeed(f"{repo.git} add beeg") - client.succeed(f"{repo.git} commit -m 'commit1'") - client.succeed(f"{repo.git} push origin main") - # memoize the revision - commit1_rev = client.succeed(f""" - {repo.git} rev-parse HEAD - """).strip() + # Request lfs fetch without any .gitattributes file + ############################################################################ + + client.succeed(f"dd if=/dev/urandom of={repo.path}/regular bs=1M count=1 >&2") + client.succeed(f"{repo.git} add : >&2") + client.succeed(f"{repo.git} commit -m 'no .gitattributes' >&2") + client.succeed(f"{repo.git} push origin main >&2") + + # memorize the revision + no_gitattributes_rev = client.succeed(f"{repo.git} rev-parse HEAD").strip() + + # fetch with lfs=true, and check that the lack of .gitattributes does not break anything + fetchGit_no_gitattributes_expr = f""" + builtins.fetchGit {{ + url = "{repo.remote}"; + rev = "{no_gitattributes_rev}"; + ref = "main"; + lfs = true; + }} + """ + fetched_no_gitattributes = client.succeed(f""" + nix eval --impure --raw --expr '({fetchGit_no_gitattributes_expr}).outPath' + """) + client.succeed(f"cmp {repo.path}/regular {fetched_no_gitattributes}/regular >&2") + + + # Add a file that should be tracked by lfs, but isn't + # (git lfs cli only throws a warning "Encountered 1 file that should have + # been a pointer, but wasn't") + ############################################################################ + + client.succeed(f"dd if=/dev/urandom of={repo.path}/black_sheep bs=1M count=1 >&2") + client.succeed(f"echo 'black_sheep filter=lfs -text' >>{repo.path}/.gitattributes") + client.succeed(f"{repo.git} add : >&2") + client.succeed(f"{repo.git} commit -m 'add misleading file' >&2") + client.succeed(f"{repo.git} push origin main >&2") + + # memorize the revision + bad_lfs_rev = client.succeed(f"{repo.git} rev-parse HEAD").strip() + + # prove that it can be cloned with regular git first + # (here we see the warning as stated above) + from tempfile import TemporaryDirectory + with TemporaryDirectory() as tempdir: + client.succeed(f"git clone -n {repo.remote} {tempdir} >&2") + client.succeed(f"git -C {tempdir} lfs install >&2") + client.succeed(f"git -C {tempdir} checkout {bad_lfs_rev} >&2") + + # check that the file is not a pointer, as expected + file_size_git = client.succeed(f"stat -c %s {tempdir}/black_sheep").strip() + assert int(file_size_git) >= expected_max_size_lfs_pointer, \ + f"non lfs file is {file_size_git}b (<{expected_max_size_lfs_pointer}b), probably a test implementation error" + + lfs_files = client.succeed(f"git -C {tempdir} lfs ls-files").strip() + assert lfs_files == "", "non lfs file is tracked by lfs, probably a test implementation error" + + client.succeed(f"cmp {repo.path}/black_sheep {tempdir}/black_sheep >&2") + + # now fetch without lfs, check that the file is not a pointer + fetchGit_bad_lfs_without_lfs_expr = f""" + builtins.fetchGit {{ + url = "{repo.remote}"; + rev = "{bad_lfs_rev}"; + ref = "main"; + lfs = false; + }} + """ + fetched_bad_lfs_without_lfs = client.succeed(f""" + nix eval --impure --raw --expr '({fetchGit_bad_lfs_without_lfs_expr}).outPath' + """) + + # check that file was not somehow turned into a pointer + file_size_bad_lfs_without_lfs = client.succeed(f"stat -c %s {fetched_bad_lfs_without_lfs}/black_sheep").strip() + + assert int(file_size_bad_lfs_without_lfs) >= expected_max_size_lfs_pointer, \ + f"non lfs-enrolled file is {file_size_bad_lfs_without_lfs}b (<{expected_max_size_lfs_pointer}b), probably a test implementation error" + client.succeed(f"cmp {repo.path}/black_sheep {fetched_bad_lfs_without_lfs}/black_sheep >&2") + + # finally fetch with lfs=true, and check that the bad file does not break anything + fetchGit_bad_lfs_with_lfs_expr = f""" + builtins.fetchGit {{ + url = "{repo.remote}"; + rev = "{bad_lfs_rev}"; + ref = "main"; + lfs = true; + }} + """ + fetchGit_bad_lfs_with_lfs = client.succeed(f""" + nix eval --impure --raw --expr '({fetchGit_bad_lfs_with_lfs_expr}).outPath' + """) + + client.succeed(f"cmp {repo.path}/black_sheep {fetchGit_bad_lfs_with_lfs}/black_sheep >&2") + + + # Add an lfs-enrolled file to the repo + ############################################################################ + + client.succeed(f"dd if=/dev/urandom of={repo.path}/beeg bs=1M count=1 >&2") + client.succeed(f"{repo.git} lfs install >&2") + client.succeed(f"{repo.git} lfs track --filename beeg >&2") + client.succeed(f"{repo.git} add : >&2") + client.succeed(f"{repo.git} commit -m 'add lfs file' >&2") + client.succeed(f"{repo.git} push origin main >&2") + + # memorize the revision + lfs_file_rev = client.succeed(f"{repo.git} rev-parse HEAD").strip() # first fetch without lfs, check that we did not smudge the file fetchGit_nolfs_expr = f""" builtins.fetchGit {{ url = "{repo.remote}"; - rev = "{commit1_rev}"; + rev = "{lfs_file_rev}"; ref = "main"; lfs = false; }} """ - - # fetch the repo via nix fetched_nolfs = client.succeed(f""" nix eval --impure --raw --expr '({fetchGit_nolfs_expr}).outPath' """) # check that file was not smudged - file_size_nolfs = client.succeed(f""" - stat -c %s {fetched_nolfs}/beeg - """).strip() + file_size_nolfs = client.succeed(f"stat -c %s {fetched_nolfs}/beeg").strip() - expected_max_size_lfs = 1024 - assert int(file_size_nolfs) < expected_max_size_lfs, f"did not set lfs=true, yet lfs-enrolled file is {file_size_nolfs}b (>{expected_max_size_lfs}b), probably smudged when we should not have" + assert int(file_size_nolfs) < expected_max_size_lfs_pointer, \ + f"did not set lfs=true, yet lfs-enrolled file is {file_size_nolfs}b (>{expected_max_size_lfs_pointer}b), probably smudged when we should not have" # now fetch with lfs=true and check that the file was smudged fetchGit_lfs_expr = f""" builtins.fetchGit {{ url = "{repo.remote}"; - rev = "{commit1_rev}"; + rev = "{lfs_file_rev}"; ref = "main"; lfs = true; }} """ - - - # fetch the repo via nix fetched_lfs = client.succeed(f""" nix eval --impure --raw --expr '({fetchGit_lfs_expr}).outPath' """) - assert fetched_lfs != fetched_nolfs, f"fetching with and without lfs yielded the same store path {fetched_lfs}, fingerprinting error?" + assert fetched_lfs != fetched_nolfs, \ + f"fetching with and without lfs yielded the same store path {fetched_lfs}, fingerprinting error?" # check that file was smudged - file_size_lfs = client.succeed(f""" - stat -c %s {fetched_lfs}/beeg - """).strip() + file_size_lfs = client.succeed(f"stat -c %s {fetched_lfs}/beeg").strip() + assert int(file_size_lfs) >= expected_max_size_lfs_pointer, \ + f"set lfs=true, yet lfs-enrolled file is {file_size_lfs}b (<{expected_max_size_lfs_pointer}), probably did not smudge when we should have" - expected_min_size_lfs = 1024 * 1024 # 1MB - assert int(file_size_lfs) >= expected_min_size_lfs, f"set lfs=true, yet lfs-enrolled file is {file_size_lfs}b (<{expected_min_size_lfs}), probably did not smudge when we should have" + + # Check that default is lfs=false + ############################################################################ + fetchGit_default_expr = f""" + builtins.fetchGit {{ + url = "{repo.remote}"; + rev = "{lfs_file_rev}"; + ref = "main"; + }} + """ + fetched_default = client.succeed(f""" + nix eval --impure --raw --expr '({fetchGit_default_expr}).outPath' + """) + + # check that file was not smudged + file_size_default = client.succeed(f"stat -c %s {fetched_default}/beeg").strip() + + assert int(file_size_default) < expected_max_size_lfs_pointer, \ + f"did not set lfs, yet lfs-enrolled file is {file_size_default}b (>{expected_max_size_lfs_pointer}b), probably bad default value" ''; }