You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@buildstream.apache.org by gi...@apache.org on 2020/12/29 13:15:05 UTC

[buildstream] 01/01: Fetch git shallow clone when possible

This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch valentindavid/git_shallow_fetch
in repository https://gitbox.apache.org/repos/asf/buildstream.git

commit 6317deb0612e22608e6b366504cb0ca19ddd92d1
Author: Valentin David <va...@codethink.co.uk>
AuthorDate: Wed Dec 5 11:47:28 2018 +0100

    Fetch git shallow clone when possible
    
    When the requested ref is advertised by the remote and no tag is
    required, we shallow clone that requested ref. Otherwise we
    fall back to a full clone.
    
    Workspace opening and tracking operations still get a full clone.
    
    Fixes #261
---
 buildstream/_gitsourcebase.py | 145 +++++++++++++++++++++----
 tests/sources/git.py          | 247 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 372 insertions(+), 20 deletions(-)

diff --git a/buildstream/_gitsourcebase.py b/buildstream/_gitsourcebase.py
index 7d07c56..f42431a 100644
--- a/buildstream/_gitsourcebase.py
+++ b/buildstream/_gitsourcebase.py
@@ -49,7 +49,7 @@ WARN_INVALID_SUBMODULE = "invalid-submodule"
 #
 class _GitMirror(SourceFetcher):
 
-    def __init__(self, source, path, url, ref, *, primary=False, tags=[]):
+    def __init__(self, source, path, url, ref, *, primary=False, tags=[], tracking=None):
 
         super().__init__()
         self.source = source
@@ -58,11 +58,101 @@ class _GitMirror(SourceFetcher):
         self.ref = ref
         self.tags = tags
         self.primary = primary
+        dirname = utils.url_directory_name(url)
         self.mirror = os.path.join(source.get_mirror_directory(), utils.url_directory_name(url))
+        self.fetch_mirror = os.path.join(source.get_mirror_directory(), '{}-{}'.format(dirname, ref))
         self.mark_download_url(url)
+        self.tracking = tracking
+
+    def mirror_path(self):
+        if os.path.exists(self.mirror):
+            return self.mirror
+        else:
+            assert os.path.exists(self.fetch_mirror)
+            return self.fetch_mirror
+
+    def ensure_fetchable(self, alias_override=None):
+
+        if os.path.exists(self.mirror):
+            return
+
+        if self.tags:
+            for tag, commit, _ in self.tags:
+                if commit != self.ref:
+                    self.source.status("{}: tag '{}' is not on commit '{}', so a full clone is required"
+                                       .format(self.source, tag, commit))
+                    self.ensure_trackable(alias_override=alias_override)
+                    return
+
+        if os.path.exists(self.fetch_mirror):
+            return
+
+        with self.source.tempdir() as tmpdir:
+            self.source.call([self.source.host_git, 'init', '--bare', tmpdir],
+                             fail="Failed to init git repository",
+                             fail_temporarily=True)
+
+            url = self.source.translate_url(self.url, alias_override=alias_override,
+                                            primary=self.primary)
+
+            self.source.call([self.source.host_git, 'remote', 'add', '--mirror=fetch', 'origin', url],
+                             cwd=tmpdir,
+                             fail="Failed to init git repository",
+                             fail_temporarily=True)
+
+            _, refs = self.source.check_output([self.source.host_git, 'ls-remote', 'origin'],
+                                               cwd=tmpdir,
+                                               fail="Failed to clone git repository {}".format(url),
+                                               fail_temporarily=True)
+
+            advertised = None
+            for ref_line in refs.splitlines():
+                commit, ref = ref_line.split('\t', 1)
+                if ref == 'HEAD':
+                    continue
+                if self.tracking:
+                    # Only consider the tracked branch/tag, for validate_cache to work
+                    if ref not in ['refs/heads/{}'.format(self.tracking),
+                                   'refs/tags/{}'.format(self.tracking),
+                                   'refs/tags/{}{}'.format(self.tracking, '^{}')]:
+                        continue
+                if self.ref == commit:
+                    if ref.endswith('^{}'):
+                        ref = ref[:-3]
+                    advertised = ref
+                    break
+
+            if advertised is None:
+                self.source.status("{}: {} is not advertised on {}, so a full clone is required"
+                                   .format(self.source, self.ref, url))
+
+                self.ensure_trackable(alias_override=alias_override)
+                return
+
+            self.source.call([self.source.host_git, 'fetch', '--depth=1', 'origin', advertised],
+                             cwd=tmpdir,
+                             fail="Failed to fetch repository",
+                             fail_temporarily=True)
+
+            # We need to have a ref to make it clonable
+            self.source.call([self.source.host_git, 'update-ref', 'HEAD', self.ref],
+                             cwd=tmpdir,
+                             fail="Failed to tag HEAD",
+                             fail_temporarily=True)
+
+            try:
+                move_atomic(tmpdir, self.fetch_mirror)
+            except DirectoryExistsError:
+                # Another process was quicker to download this repository.
+                # Let's discard our own
+                self.source.status("{}: Discarding duplicate clone of {}"
+                                   .format(self.source, url))
+            except OSError as e:
+                raise SourceError("{}: Failed to move cloned git repository {} from '{}' to '{}': {}"
+                                  .format(self.source, url, tmpdir, self.fetch_mirror, e)) from e
 
     # Ensures that the mirror exists
-    def ensure(self, alias_override=None):
+    def ensure_trackable(self, alias_override=None):
 
         # Unfortunately, git does not know how to only clone just a specific ref,
         # so we have to download all of those gigs even if we only need a couple
@@ -97,18 +187,20 @@ class _GitMirror(SourceFetcher):
                                         alias_override=alias_override,
                                         primary=self.primary)
 
+        mirror = self.mirror_path()
+
         if alias_override:
             remote_name = utils.url_directory_name(alias_override)
             _, remotes = self.source.check_output(
                 [self.source.host_git, 'remote'],
-                fail="Failed to retrieve list of remotes in {}".format(self.mirror),
-                cwd=self.mirror
+                fail="Failed to retrieve list of remotes in {}".format(mirror),
+                cwd=mirror
             )
             if remote_name not in remotes:
                 self.source.call(
                     [self.source.host_git, 'remote', 'add', remote_name, url],
                     fail="Failed to add remote {} with url {}".format(remote_name, url),
-                    cwd=self.mirror
+                    cwd=mirror
                 )
         else:
             remote_name = "origin"
@@ -117,7 +209,7 @@ class _GitMirror(SourceFetcher):
                           '+refs/heads/*:refs/heads/*', '+refs/tags/*:refs/tags/*'],
                          fail="Failed to fetch from remote git repository: {}".format(url),
                          fail_temporarily=True,
-                         cwd=self.mirror)
+                         cwd=mirror)
 
     def fetch(self, alias_override=None):
         # Resolve the URL for the message
@@ -128,7 +220,7 @@ class _GitMirror(SourceFetcher):
         with self.source.timed_activity("Fetching from {}"
                                         .format(resolved_url),
                                         silent_nested=True):
-            self.ensure(alias_override)
+            self.ensure_fetchable(alias_override)
             if not self.has_ref():
                 self._fetch(alias_override)
             self.assert_ref()
@@ -137,12 +229,14 @@ class _GitMirror(SourceFetcher):
         if not self.ref:
             return False
 
-        # If the mirror doesnt exist, we also dont have the ref
-        if not os.path.exists(self.mirror):
+        if not os.path.exists(self.mirror) and not os.path.exists(self.fetch_mirror):
+            # If neither mirror exists, we also don't have the ref
             return False
 
+        mirror = self.mirror_path()
+
         # Check if the ref is really there
-        rc = self.source.call([self.source.host_git, 'cat-file', '-t', self.ref], cwd=self.mirror)
+        rc = self.source.call([self.source.host_git, 'cat-file', '-t', self.ref], cwd=mirror)
         return rc == 0
 
     def assert_ref(self):
@@ -192,11 +286,13 @@ class _GitMirror(SourceFetcher):
     def stage(self, directory):
         fullpath = os.path.join(directory, self.path)
 
+        mirror = self.mirror_path()
+
         # Using --shared here avoids copying the objects into the checkout, in any
         # case we're just checking out a specific commit and then removing the .git/
         # directory.
-        self.source.call([self.source.host_git, 'clone', '--no-checkout', '--shared', self.mirror, fullpath],
-                         fail="Failed to create git mirror {} in directory: {}".format(self.mirror, fullpath),
+        self.source.call([self.source.host_git, 'clone', '--no-checkout', '--shared', mirror, fullpath],
+                         fail="Failed to create git mirror {} in directory: {}".format(mirror, fullpath),
                          fail_temporarily=True)
 
         self.source.call([self.source.host_git, 'checkout', '--force', self.ref],
@@ -226,9 +322,11 @@ class _GitMirror(SourceFetcher):
 
     # List the submodules (path/url tuples) present at the given ref of this repo
     def submodule_list(self):
+        mirror = self.mirror_path()
+
         modules = "{}:{}".format(self.ref, GIT_MODULES)
         exit_code, output = self.source.check_output(
-            [self.source.host_git, 'show', modules], cwd=self.mirror)
+            [self.source.host_git, 'show', modules], cwd=mirror)
 
         # If git show reports error code 128 here, we take it to mean there is
         # no .gitmodules file to display for the given revision.
@@ -256,6 +354,8 @@ class _GitMirror(SourceFetcher):
     # Fetch the ref which this mirror requires its submodule to have,
     # at the given ref of this mirror.
     def submodule_ref(self, submodule, ref=None):
+        mirror = self.mirror_path()
+
         if not ref:
             ref = self.ref
 
@@ -264,7 +364,7 @@ class _GitMirror(SourceFetcher):
         _, output = self.source.check_output([self.source.host_git, 'ls-tree', ref, submodule],
                                              fail="ls-tree failed for commit {} and submodule: {}".format(
                                                  ref, submodule),
-                                             cwd=self.mirror)
+                                             cwd=mirror)
 
         # read the commit hash from the output
         fields = output.split()
@@ -392,8 +492,11 @@ class _GitSourceBase(Source):
         self.track_tags = self.node_get_member(node, bool, 'track-tags', False)
 
         self.original_url = self.node_get_member(node, str, 'url')
-        self.mirror = self.BST_MIRROR_CLASS(self, '', self.original_url, ref, tags=tags, primary=True)
         self.tracking = self.node_get_member(node, str, 'track', None)
+        self.mirror = self.BST_MIRROR_CLASS(self, '', self.original_url, ref,
+                                            tags=tags,
+                                            primary=True,
+                                            tracking=self.tracking)
 
         self.ref_format = self.node_get_member(node, str, 'ref-format', 'sha1')
         if self.ref_format not in ['sha1', 'git-describe']:
@@ -511,7 +614,7 @@ class _GitSourceBase(Source):
         with self.timed_activity("Tracking {} from {}"
                                  .format(self.tracking, resolved_url),
                                  silent_nested=True):
-            self.mirror.ensure()
+            self.mirror.ensure_trackable()
             self.mirror._fetch()
 
             # Update self.mirror.ref and node.ref from the self.tracking branch
@@ -521,6 +624,7 @@ class _GitSourceBase(Source):
 
     def init_workspace(self, directory):
         # XXX: may wish to refactor this as some code dupe with stage()
+        self.mirror.ensure_trackable()
         self._refresh_submodules()
 
         with self.timed_activity('Setting up workspace "{}"'.format(directory), silent_nested=True):
@@ -595,15 +699,16 @@ class _GitSourceBase(Source):
         # Assert that the ref exists in the track tag/branch, if track has been specified.
         ref_in_track = False
         if self.tracking:
+            mirror = self.mirror.mirror_path()
             _, branch = self.check_output([self.host_git, 'branch', '--list', self.tracking,
                                            '--contains', self.mirror.ref],
-                                          cwd=self.mirror.mirror)
+                                          cwd=mirror)
             if branch:
                 ref_in_track = True
             else:
                 _, tag = self.check_output([self.host_git, 'tag', '--list', self.tracking,
                                             '--contains', self.mirror.ref],
-                                           cwd=self.mirror.mirror)
+                                           cwd=mirror)
                 if tag:
                     ref_in_track = True
 
@@ -628,7 +733,7 @@ class _GitSourceBase(Source):
 
         self._refresh_submodules()
         for mirror in self.submodules:
-            if not os.path.exists(mirror.mirror):
+            if not os.path.exists(mirror.mirror) and not os.path.exists(mirror.fetch_mirror):
                 return False
             if not mirror.has_ref():
                 return False
@@ -640,7 +745,7 @@ class _GitSourceBase(Source):
     # Assumes that we have our mirror and we have the ref which we point to
     #
     def _refresh_submodules(self):
-        self.mirror.ensure()
+        self.mirror.ensure_fetchable()
         submodules = []
 
         for path, url in self.mirror.submodule_list():
diff --git a/tests/sources/git.py b/tests/sources/git.py
index b7b175e..43cf95e 100644
--- a/tests/sources/git.py
+++ b/tests/sources/git.py
@@ -34,6 +34,7 @@ from buildstream import _yaml
 from buildstream.plugin import CoreWarnings
 from buildstream.testing import cli  # pylint: disable=unused-import
 from buildstream.testing import create_repo
+from buildstream.utils import url_directory_name
 
 from tests.testutils.site import HAVE_GIT, HAVE_OLD_GIT
 
@@ -1225,3 +1226,249 @@ def test_overwrite_rogue_tag_multiple_remotes(cli, tmpdir, datafiles):
 
     result = cli.run(project=project, args=['build', 'target.bst'])
     result.assert_success()
+
+
+@pytest.mark.skipif(HAVE_GIT is False, reason="git is not available")
+@pytest.mark.datafiles(os.path.join(DATA_DIR, 'template'))
+def test_fetch_shallow(cli, tmpdir, datafiles):
+    project = str(datafiles)
+
+    repo = create_repo('git', str(tmpdir))
+    previous_ref = repo.create(os.path.join(project, 'repofiles'))
+
+    file1 = os.path.join(str(tmpdir), 'file1')
+    with open(file1, 'w') as f:
+        f.write('test\n')
+    ref = repo.add_file(file1)
+
+    source_config = repo.source_config(ref=ref)
+
+    # Write out our test target, pinned to the newest commit
+    element = {
+        'kind': 'import',
+        'sources': [
+            source_config
+        ]
+    }
+    _yaml.dump(element, os.path.join(project, 'target.bst'))
+
+    sources_dir = os.path.join(str(tmpdir), 'sources')
+    os.makedirs(sources_dir, exist_ok=True)
+    config = {
+        'sourcedir': sources_dir
+    }
+    cli.configure(config)
+
+    result = cli.run(project=project, args=[
+        'source', 'fetch', 'target.bst'
+    ])
+    result.assert_success()
+
+    cache_dir_name = url_directory_name(source_config['url'])
+    full_cache_path = os.path.join(sources_dir, 'git', cache_dir_name)
+    shallow_cache_path = os.path.join(sources_dir, 'git', '{}-{}'.format(cache_dir_name, ref))
+
+    assert os.path.exists(shallow_cache_path)
+    assert not os.path.exists(full_cache_path)
+
+    output = subprocess.run(['git', 'log', '--format=format:%H'],
+                            cwd=shallow_cache_path,
+                            stdout=subprocess.PIPE).stdout.decode('ascii')
+    assert output.splitlines() == [ref]
+
+    result = cli.run(project=project, args=[
+        'build', 'target.bst'
+    ])
+    result.assert_success()
+
+    output = subprocess.run(['git', 'log', '--format=format:%H'],
+                            cwd=shallow_cache_path,
+                            stdout=subprocess.PIPE).stdout.decode('ascii')
+    assert output.splitlines() == [ref]
+
+    assert os.path.exists(shallow_cache_path)
+    assert not os.path.exists(full_cache_path)
+
+    result = cli.run(project=project, args=[
+        'source', 'track', 'target.bst'
+    ])
+    result.assert_success()
+
+    assert os.path.exists(full_cache_path)
+    output = subprocess.run(['git', 'log', '--format=format:%H'],
+                            cwd=full_cache_path,
+                            stdout=subprocess.PIPE).stdout.decode('ascii')
+    assert output.splitlines() == [ref, previous_ref]
+
+
+@pytest.mark.skipif(HAVE_GIT is False, reason="git is not available")
+@pytest.mark.datafiles(os.path.join(DATA_DIR, 'template'))
+def test_fetch_shallow_not_tagged(cli, tmpdir, datafiles):
+    """When a ref is not tagged and not head of branch on remote we cannot
+    get a shallow clone.  It should automatically get a full clone.
+    """
+
+    project = str(datafiles)
+
+    repo = create_repo('git', str(tmpdir))
+    previous_ref = repo.create(os.path.join(project, 'repofiles'))
+
+    file1 = os.path.join(str(tmpdir), 'file1')
+    with open(file1, 'w') as f:
+        f.write('test\n')
+    ref = repo.add_file(file1)
+
+    source_config = repo.source_config(ref=previous_ref)
+
+    # Write out our test target, pinned to the older (non-head) commit
+    element = {
+        'kind': 'import',
+        'sources': [
+            source_config
+        ]
+    }
+    _yaml.dump(element, os.path.join(project, 'target.bst'))
+
+    sources_dir = os.path.join(str(tmpdir), 'sources')
+    os.makedirs(sources_dir, exist_ok=True)
+    config = {
+        'sourcedir': sources_dir
+    }
+    cli.configure(config)
+
+    result = cli.run(project=project, args=[
+        'source', 'fetch', 'target.bst'
+    ])
+    result.assert_success()
+
+    cache_dir_name = url_directory_name(source_config['url'])
+    full_cache_path = os.path.join(sources_dir, 'git', cache_dir_name)
+    shallow_cache_path = os.path.join(sources_dir, 'git', '{}-{}'.format(cache_dir_name, previous_ref))
+
+    assert not os.path.exists(shallow_cache_path)
+    assert os.path.exists(full_cache_path)
+
+    output = subprocess.run(['git', 'log', '--format=format:%H'],
+                            cwd=full_cache_path,
+                            stdout=subprocess.PIPE).stdout.decode('ascii')
+    assert output.splitlines() == [ref, previous_ref]
+
+
+@pytest.mark.skipif(HAVE_GIT is False, reason="git is not available")
+@pytest.mark.datafiles(os.path.join(DATA_DIR, 'template'))
+def test_fetch_shallow_annotated_tag(cli, tmpdir, datafiles):
+    """When a ref is not head of a branch but carries an annotated tag on
+    the remote, a shallow clone of that ref should still be fetched.
+    """
+
+    project = str(datafiles)
+
+    repo = create_repo('git', str(tmpdir))
+    previous_ref = repo.create(os.path.join(project, 'repofiles'))
+
+    repo.add_annotated_tag('tag', 'tag')
+
+    file1 = os.path.join(str(tmpdir), 'file1')
+    with open(file1, 'w') as f:
+        f.write('test\n')
+    repo.add_file(file1)
+
+    source_config = repo.source_config(ref=previous_ref)
+    del source_config['track']
+
+    # Write out our test target, pinned to the commit carrying the annotated tag
+    element = {
+        'kind': 'import',
+        'sources': [
+            source_config
+        ]
+    }
+    _yaml.dump(element, os.path.join(project, 'target.bst'))
+
+    sources_dir = os.path.join(str(tmpdir), 'sources')
+    os.makedirs(sources_dir, exist_ok=True)
+    config = {
+        'sourcedir': sources_dir
+    }
+    cli.configure(config)
+
+    result = cli.run(project=project, args=[
+        'source', 'fetch', 'target.bst'
+    ])
+    result.assert_success()
+
+    cache_dir_name = url_directory_name(source_config['url'])
+    full_cache_path = os.path.join(sources_dir, 'git', cache_dir_name)
+    shallow_cache_path = os.path.join(sources_dir, 'git', '{}-{}'.format(cache_dir_name, previous_ref))
+
+    assert os.path.exists(shallow_cache_path)
+    assert not os.path.exists(full_cache_path)
+
+    output = subprocess.run(['git', 'log', '--format=format:%H'],
+                            cwd=shallow_cache_path,
+                            stdout=subprocess.PIPE).stdout.decode('ascii')
+    assert output.splitlines() == [previous_ref]
+
+
+@pytest.mark.skipif(HAVE_GIT is False, reason="git is not available")
+@pytest.mark.datafiles(os.path.join(DATA_DIR, 'template'))
+def test_fetch_shallow_workspace_open(cli, tmpdir, datafiles):
+    """
+    Workspaces should get a full clone.
+    """
+    project = str(datafiles)
+
+    repo = create_repo('git', str(tmpdir))
+    previous_ref = repo.create(os.path.join(project, 'repofiles'))
+
+    file1 = os.path.join(str(tmpdir), 'file1')
+    with open(file1, 'w') as f:
+        f.write('test\n')
+    ref = repo.add_file(file1)
+
+    source_config = repo.source_config(ref=ref)
+
+    # Write out our test target, pinned to the newest commit
+    element = {
+        'kind': 'import',
+        'sources': [
+            source_config
+        ]
+    }
+    _yaml.dump(element, os.path.join(project, 'target.bst'))
+
+    sources_dir = os.path.join(str(tmpdir), 'sources')
+    os.makedirs(sources_dir, exist_ok=True)
+    config = {
+        'sourcedir': sources_dir
+    }
+    cli.configure(config)
+
+    result = cli.run(project=project, args=[
+        'source', 'fetch', 'target.bst'
+    ])
+    result.assert_success()
+
+    cache_dir_name = url_directory_name(source_config['url'])
+    full_cache_path = os.path.join(sources_dir, 'git', cache_dir_name)
+    shallow_cache_path = os.path.join(sources_dir, 'git', '{}-{}'.format(cache_dir_name, ref))
+
+    assert os.path.exists(shallow_cache_path)
+    assert not os.path.exists(full_cache_path)
+
+    output = subprocess.run(['git', 'log', '--format=format:%H'],
+                            cwd=shallow_cache_path,
+                            stdout=subprocess.PIPE).stdout.decode('ascii')
+    assert output.splitlines() == [ref]
+
+    workspace = os.path.join(str(tmpdir), 'workspace')
+
+    result = cli.run(project=project, args=[
+        'workspace', 'open', 'target.bst', '--directory', workspace
+    ])
+    result.assert_success()
+
+    output = subprocess.run(['git', 'log', '--format=format:%H'],
+                            cwd=workspace,
+                            stdout=subprocess.PIPE).stdout.decode('ascii')
+    assert output.splitlines() == [ref, previous_ref]