Skip to content

Commit

Permalink
Bug 562676: Push efficiency - report missing objects only.
Browse files Browse the repository at this point in the history
Fix with tests.
  • Loading branch information
Artem Tikhomirov authored and jelmer committed Dec 5, 2012
1 parent d3690e5 commit c1787f5
Show file tree
Hide file tree
Showing 3 changed files with 309 additions and 20 deletions.
132 changes: 112 additions & 20 deletions dulwich/object_store.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# object_store.py -- Object store for git objects
# Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org>
# Copyright (C) 2008-2012 Jelmer Vernooij <jelmer@samba.org>
# and others
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
Expand Down Expand Up @@ -220,6 +221,28 @@ def peel_sha(self, sha):
obj = self[sha]
return obj

def _collect_ancestors(self, heads, common = set()):
"""Collect all ancestors of heads up to (excluding) those in common
:param heads: commits to start from
:param common: commits to end at, or empty set to walk repository completely
:return: a tuple (A, B) where A - all commits reachable
from heads but not present in common, B - common (shared) elements
that are directly reachable from heads
"""
bases = set()
commits = set()
queue = []
queue.extend(heads)
while queue:
e = queue.pop(0)
if e in common:
bases.add(e)
elif e not in commits:
commits.add(e)
cmt = self[e]
queue.extend(cmt.parents)
return (commits, bases)


class PackBasedObjectStore(BaseObjectStore):

Expand Down Expand Up @@ -792,6 +815,52 @@ def tree_lookup_path(lookup_obj, root_sha, path):
raise NotTreeError(root_sha)
return tree.lookup_path(lookup_obj, path)

def _collect_filetree_revs(obj_store, tree_sha, kset):
"""Collect SHA1s of files and directories for specified tree
(identified by SHA1)
:param obj_store: Object store to get objects by SHA from
:param tree_sha: tree reference to walk
:param kset: set to fill with references to files and directories
"""
filetree = obj_store[tree_sha]
for name,mode,sha in filetree.iteritems():
if not S_ISGITLINK(mode) and sha not in kset:
kset.add(sha)
if stat.S_ISDIR(mode):
_collect_filetree_revs(obj_store, sha, kset)

def _split_commits_and_tags(obj_store, lst, ignore_unknown = False):
"""Split lst into two lists, one with commit SHA1s, another with
tag SHA1s. Commits referenced by tags are included into commits
list as well. Only SHA1s known in this repository will get
through, and unless ignore_unknown argument is True, KeyError
is thrown for SHA1 missing in the repository
:param obj_store: Object store to get objects by SHA1 from
:param lst: Collection of commit and tag SHAs
:param ignore_unknown: True to skip SHA1 missing in the
repository silently.
:return: A tuple of (commits, tags) SHA1s
"""
commits = set()
tags = set()
for e in lst:
try:
o = obj_store[e]
except KeyError:
if ignore_unknown:
pass
else:
raise
else:
if isinstance(o, Commit):
commits.add(e)
elif isinstance(o, Tag):
tags.add(e)
commits.add(o.object[1])
else:
raise KeyError('Not a commit or a tag: %s' % e)
return (commits, tags)


class MissingObjectFinder(object):
"""Find the objects missing from another object store.
Expand All @@ -808,11 +877,44 @@ class MissingObjectFinder(object):

def __init__(self, object_store, haves, wants, progress=None,
get_tagged=None):
haves = set(haves)
self.sha_done = haves
self.objects_to_send = set([(w, None, False) for w in wants
if w not in haves])
self.object_store = object_store
# process Commits and Tags differently
# Note, while haves may list commits/tags not available locally,
# and such SHAs would get filtered out by _split_commits_and_tags,
# wants shall list only known SHAs, and otherwise
# _split_commits_and_tags fails with KeyError
have_commits, have_tags = \
_split_commits_and_tags(object_store, haves, True)
want_commits, want_tags = \
_split_commits_and_tags(object_store, wants, False)
# all_ancestors is a set of commits that shall not be sent
# (complete repository up to 'haves')
all_ancestors = object_store._collect_ancestors(have_commits)[0]
# all_missing - complete set of commits between haves and wants
# common - commits from all_ancestors we hit into while
# traversing parent hierarchy of wants
missing_commits, common_commits = \
object_store._collect_ancestors(want_commits, all_ancestors)
self.sha_done = set()
# Now, fill sha_done with commits and revisions of
# files and directories known to be both locally
# and on target. Thus these commits and files
# won't get selected for fetch
for h in common_commits:
self.sha_done.add(h)
cmt = object_store[h]
_collect_filetree_revs(object_store, cmt.tree, self.sha_done)
# record tags we have as visited, too
for t in have_tags:
self.sha_done.add(t)

missing_tags = want_tags.difference(have_tags)
# in fact, what we 'want' is commits and tags
# we've found missing
wants = missing_commits.union(missing_tags)

self.objects_to_send = set([(w, None, False) for w in wants])

if progress is None:
self.progress = lambda x: None
else:
Expand All @@ -823,18 +925,6 @@ def add_todo(self, entries):
self.objects_to_send.update([e for e in entries
if not e[0] in self.sha_done])

def parse_tree(self, tree):
self.add_todo([(sha, name, not stat.S_ISDIR(mode))
for name, mode, sha in tree.iteritems()
if not S_ISGITLINK(mode)])

def parse_commit(self, commit):
self.add_todo([(commit.tree, "", False)])
self.add_todo([(p, None, False) for p in commit.parents])

def parse_tag(self, tag):
self.add_todo([(tag.object[1], None, False)])

def next(self):
while True:
if not self.objects_to_send:
Expand All @@ -845,11 +935,13 @@ def next(self):
if not leaf:
o = self.object_store[sha]
if isinstance(o, Commit):
self.parse_commit(o)
self.add_todo([(o.tree, "", False)])
elif isinstance(o, Tree):
self.parse_tree(o)
self.add_todo([(s, n, not stat.S_ISDIR(m))
for n, m, s in o.iteritems()
if not S_ISGITLINK(m)])
elif isinstance(o, Tag):
self.parse_tag(o)
self.add_todo([(o.object[1], None, False)])
if sha in self._tagged:
self.add_todo([(self._tagged[sha], None, True)])
self.sha_done.add(sha)
Expand Down
1 change: 1 addition & 0 deletions dulwich/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ def self_test_suite():
'lru_cache',
'objects',
'object_store',
'missing_obj_finder',
'pack',
'patch',
'protocol',
Expand Down
196 changes: 196 additions & 0 deletions dulwich/tests/test_missing_obj_finder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
# test_missing_obj_finder.py -- tests for MissingObjectFinder
# Copyright (C) 2012 syntevo GmbH
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; version 2
# or (at your option) any later version of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.

from dulwich.errors import (
MissingCommitError,
)
from dulwich.object_store import (
MemoryObjectStore,
)
from dulwich.objects import (
Commit,
Blob,
)
from dulwich.tests import TestCase
from utils import (
F,
make_object,
build_commit_graph,
)

class MissingObjectFinderTest(TestCase):

def setUp(self):
super(MissingObjectFinderTest, self).setUp()
self.store = MemoryObjectStore()
self.commits = []

def __getitem__(self, n):
# rename for brevity
return self.commits[n-1]

def cmt(self, n):
return self[n]

def assertMissingMatch(self, haves, wants, expected):
for sha,path in self.store.find_missing_objects(haves, wants):
self.assertTrue(sha in expected, "FAILURE: (%s,%s) erroneously reported as missing" % (sha,path))
expected.remove(sha)

self.assertFalse(len(expected) > 0, "FAILURE: some objects are not reported as missing: %s" % (expected))

class MOFLinearRepoTest(MissingObjectFinderTest):
def setUp(self):
super(MOFLinearRepoTest, self).setUp()
f1_1 = make_object(Blob, data='f1') # present in 1, removed in 3
f2_1 = make_object(Blob, data='f2') # present in all revisions, changed in 2 and 3
f2_2 = make_object(Blob, data='f2-changed')
f2_3 = make_object(Blob, data='f2-changed-again')
f3_2 = make_object(Blob, data='f3') # added in 2, left unmodified in 3

commit_spec = [[1], [2,1], [3,2]]
trees = {1: [('f1', f1_1), ('f2',f2_1)],
2: [('f1',f1_1), ('f2', f2_2), ('f3', f3_2)],
3: [('f2', f2_3), ('f3', f3_2)] }
# commit 1: f1 and f2
# commit 2: f3 added, f2 changed. Missing shall report commit id and a tree referenced by commit
# commit 3: f1 removed, f2 changed. Commit sha and root tree sha shall be reported as modified
self.commits = build_commit_graph(self.store, commit_spec, trees)
self.missing_1_2 = [self.cmt(2).id, self.cmt(2).tree, f2_2.id, f3_2.id]
self.missing_2_3 = [self.cmt(3).id, self.cmt(3).tree, f2_3.id]
self.missing_1_3 = [
self.cmt(2).id, self.cmt(3).id,
self.cmt(2).tree, self.cmt(3).tree,
f2_2.id, f3_2.id, f2_3.id]


def test_1_to_2(self):
self.assertMissingMatch([self.cmt(1).id], [self.cmt(2).id], self.missing_1_2)

def test_2_to_3(self):
self.assertMissingMatch([self.cmt(2).id], [self.cmt(3).id], self.missing_2_3)

def test_1_to_3(self):
self.assertMissingMatch([self.cmt(1).id], [self.cmt(3).id], self.missing_1_3)

def test_bogus_haves_failure(self):
"""Ensure non-existent SHA in haves are not tolerated"""
bogus_sha = self.cmt(2).id[::-1]
haves = [self.cmt(1).id, bogus_sha]
wants = [self.cmt(3).id]
self.assertRaises(KeyError, self.store.find_missing_objects, self.store, haves, wants)

def test_bogus_wants_failure(self):
"""Ensure non-existent SHA in wants are not tolerated"""
bogus_sha = self.cmt(2).id[::-1]
haves = [self.cmt(1).id]
wants = [self.cmt(3).id, bogus_sha]
self.assertRaises(KeyError, self.store.find_missing_objects, self.store, haves, wants)

def test_no_changes(self):
self.assertMissingMatch([self.cmt(3).id], [self.cmt(3).id], [])

class MOFMergeForkRepoTest(MissingObjectFinderTest):
""" 1 --- 2 --- 4 --- 6 --- 7
\ /
3 ---
\
5
"""

def setUp(self):
super(MOFMergeForkRepoTest, self).setUp()
f1_1 = make_object(Blob, data='f1')
f1_2 = make_object(Blob, data='f1-2')
f1_4 = make_object(Blob, data='f1-4')
f1_7 = make_object(Blob, data='f1-2') # same data as in rev 2
f2_1 = make_object(Blob, data='f2')
f2_3 = make_object(Blob, data='f2-3')
f3_3 = make_object(Blob, data='f3')
f3_5 = make_object(Blob, data='f3-5')
commit_spec = [[1], [2,1], [3,2], [4,2], [5,3], [6,3,4], [7,6]]
trees = {1: [('f1', f1_1), ('f2',f2_1)],
2: [('f1',f1_2), ('f2', f2_1)], # f1 changed
3: [('f1',f1_2), ('f2', f2_3), ('f3', f3_3)], # f3 added, f2 changed
4: [('f1',f1_4), ('f2',f2_1)], # f1 changed
5: [('f1',f1_2), ('f3', f3_5)], # f2 removed, f3 changed
6: [('f1',f1_4), ('f2',f2_3), ('f3', f3_3)], # merged 3 and 4
7: [('f1',f1_7), ('f2',f2_3)]} # f1 changed to match rev2. f3 removed
self.commits = build_commit_graph(self.store, commit_spec, trees)

self.f1_2_id = f1_2.id
self.f1_4_id = f1_4.id
self.f1_7_id = f1_7.id
self.f2_3_id = f2_3.id
self.f3_3_id = f3_3.id

self.assertEquals(f1_2.id, f1_7.id, "[sanity]")

def test_have6_want7(self):
"""
have 6, want 7. Ideally, shall not report f1_7 as it's the same as f1_2,
however, to do so, MissingObjectFinder shall not record trees of common commits
only, but also all parent trees and tree items, which is an overkill
(i.e. in sha_done it records f1_4 as known, and doesn't record f1_2 was known
prior to that, hence can't detect f1_7 is in fact f1_2 and shall not be reported)
"""
self.assertMissingMatch([self.cmt(6).id], [self.cmt(7).id], [self.cmt(7).id, self.cmt(7).tree, self.f1_7_id])

def test_have4_want7(self):
"""
have 4, want 7. Shall not include rev5 as it is not in the tree between 4 and 7
(well, it is, but its SHA's are irrelevant for 4..7 commit hierarchy)
"""
self.assertMissingMatch([self.cmt(4).id], [self.cmt(7).id], [
self.cmt(7).id, self.cmt(6).id, self.cmt(3).id,
self.cmt(7).tree, self.cmt(6).tree, self.cmt(3).tree,
self.f2_3_id, self.f3_3_id])

def test_have1_want6(self):
"""
have 1, want 6. Shall not include rev5
"""
self.assertMissingMatch([self.cmt(1).id], [self.cmt(6).id], [
self.cmt(6).id, self.cmt(4).id, self.cmt(3).id, self.cmt(2).id,
self.cmt(6).tree, self.cmt(4).tree, self.cmt(3).tree, self.cmt(2).tree,
self.f1_2_id, self.f1_4_id, self.f2_3_id, self.f3_3_id])

def test_have3_want6(self):
"""
have 3, want 7. Shall not report rev2 and its tree, because
haves(3) means has parents, i.e. rev2, too
BUT shall report any changes descending rev2 (excluding rev3)
Shall NOT report f1_7 as it's techically == f1_2
"""
self.assertMissingMatch([self.cmt(3).id], [self.cmt(7).id], [
self.cmt(7).id, self.cmt(6).id, self.cmt(4).id,
self.cmt(7).tree, self.cmt(6).tree, self.cmt(4).tree,
self.f1_4_id])

def test_have5_want7(self):
"""
have 5, want 7. Common parent is rev2, hence children of rev2 from
a descent line other than rev5 shall be reported
"""
# expects f1_4 from rev6. f3_5 is known in rev5;
# f1_7 shall be the same as f1_2 (known, too)
self.assertMissingMatch([self.cmt(5).id], [self.cmt(7).id], [
self.cmt(7).id, self.cmt(6).id, self.cmt(4).id,
self.cmt(7).tree, self.cmt(6).tree, self.cmt(4).tree,
self.f1_4_id])

0 comments on commit c1787f5

Please sign in to comment.