Kiln » Dependencies » Dulwich Read More
Clone URL:  
Pushed to one repository · View In Graph Contained in master

Bug 562676: Push efficiency - report missing objects only.

Fix with tests.

Changeset 8de34e3d1f19

Parent d0a63d31eac3

committed by Jelmer Vernooij

authored by Artem Tikhomirov

Changes to 3 files · Browse files at 8de34e3d1f19 Showing diff from parent d0a63d31eac3 Diff from another changeset...

 
1
2
 
 
3
4
5
 
220
221
222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
224
225
 
792
793
794
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
795
796
797
 
808
809
810
811
812
813
814
815
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
816
817
818
 
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
 
845
846
847
848
 
849
850
 
 
 
851
852
 
853
854
855
 
1
 
2
3
4
5
6
 
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
 
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
 
877
878
879
 
 
 
 
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
 
924
925
926
 
 
 
 
 
 
 
 
 
 
 
 
927
928
929
 
935
936
937
 
938
939
 
940
941
942
943
 
944
945
946
947
@@ -1,5 +1,6 @@
 # object_store.py -- Object store for git objects -# Copyright (C) 2008-2009 Jelmer Vernooij <jelmer@samba.org> +# Copyright (C) 2008-2012 Jelmer Vernooij <jelmer@samba.org> +# and others  #  # This program is free software; you can redistribute it and/or  # modify it under the terms of the GNU General Public License @@ -220,6 +221,28 @@
  obj = self[sha]   return obj   + def _collect_ancestors(self, heads, common = set()): + """Collect all ancestors of heads up to (excluding) those in common + :param heads: commits to start from + :param common: commits to end at, or empty set to walk repository completely + :return: a tuple (A, B) where A - all commits reachable + from heads but not present in common, B - common (shared) elements + that are directly reachable from heads + """ + bases = set() + commits = set() + queue = [] + queue.extend(heads) + while queue: + e = queue.pop(0) + if e in common: + bases.add(e) + elif e not in commits: + commits.add(e) + cmt = self[e] + queue.extend(cmt.parents) + return (commits, bases) +    class PackBasedObjectStore(BaseObjectStore):   @@ -792,6 +815,52 @@
  raise NotTreeError(root_sha)   return tree.lookup_path(lookup_obj, path)   +def _collect_filetree_revs(obj_store, tree_sha, kset): + """Collect SHA1s of files and directories for specified tree + (identified by SHA1) + :param obj_store: Object store to get objects by SHA from + :param tree_sha: tree reference to walk + :param kset: set to fill with references to files and directories + """ + filetree = obj_store[tree_sha] + for name,mode,sha in filetree.iteritems(): + if not S_ISGITLINK(mode) and sha not in kset: + kset.add(sha) + if stat.S_ISDIR(mode): + _collect_filetree_revs(obj_store, sha, kset) + +def _split_commits_and_tags(obj_store, lst, ignore_unknown = False): + """Split lst into two lists, one with commit SHA1s, another with + tag SHA1s. Commits referenced by tags are included into commits + list as well. Only SHA1s known in this repository will get + through, and unless ignore_unknown argument is True, KeyError + is thrown for SHA1 missing in the repository + :param obj_store: Object store to get objects by SHA1 from + :param lst: Collection of commit and tag SHAs + :param ignore_unknown: True to skip SHA1 missing in the + repository silently. + :return: A tuple of (commits, tags) SHA1s + """ + commits = set() + tags = set() + for e in lst: + try: + o = obj_store[e] + except KeyError: + if ignore_unknown: + pass + else: + raise + else: + if isinstance(o, Commit): + commits.add(e) + elif isinstance(o, Tag): + tags.add(e) + commits.add(o.object[1]) + else: + raise KeyError('Not a commit or a tag: %s' % e) + return (commits, tags) +    class MissingObjectFinder(object):   """Find the objects missing from another object store. @@ -808,11 +877,44 @@
    def __init__(self, object_store, haves, wants, progress=None,   get_tagged=None): - haves = set(haves) - self.sha_done = haves - self.objects_to_send = set([(w, None, False) for w in wants - if w not in haves])   self.object_store = object_store + # process Commits and Tags differently + # Note, while haves may list commits/tags not available locally, + # and such SHAs would get filtered out by _split_commits_and_tags, + # wants shall list only known SHAs, and otherwise + # _split_commits_and_tags fails with KeyError + have_commits, have_tags = \ + _split_commits_and_tags(object_store, haves, True) + want_commits, want_tags = \ + _split_commits_and_tags(object_store, wants, False) + # all_ancestors is a set of commits that shall not be sent + # (complete repository up to 'haves') + all_ancestors = object_store._collect_ancestors(have_commits)[0] + # all_missing - complete set of commits between haves and wants + # common - commits from all_ancestors we hit into while + # traversing parent hierarchy of wants + missing_commits, common_commits = \ + object_store._collect_ancestors(want_commits, all_ancestors) + self.sha_done = set() + # Now, fill sha_done with commits and revisions of + # files and directories known to be both locally + # and on target. Thus these commits and files + # won't get selected for fetch + for h in common_commits: + self.sha_done.add(h) + cmt = object_store[h] + _collect_filetree_revs(object_store, cmt.tree, self.sha_done) + # record tags we have as visited, too + for t in have_tags: + self.sha_done.add(t) + + missing_tags = want_tags.difference(have_tags) + # in fact, what we 'want' is commits and tags + # we've found missing + wants = missing_commits.union(missing_tags) + + self.objects_to_send = set([(w, None, False) for w in wants]) +   if progress is None:   self.progress = lambda x: None   else: @@ -822,18 +924,6 @@
  def add_todo(self, entries):   self.objects_to_send.update([e for e in entries   if not e[0] in self.sha_done]) - - def parse_tree(self, tree): - self.add_todo([(sha, name, not stat.S_ISDIR(mode)) - for name, mode, sha in tree.iteritems() - if not S_ISGITLINK(mode)]) - - def parse_commit(self, commit): - self.add_todo([(commit.tree, "", False)]) - self.add_todo([(p, None, False) for p in commit.parents]) - - def parse_tag(self, tag): - self.add_todo([(tag.object[1], None, False)])     def next(self):   while True: @@ -845,11 +935,13 @@
  if not leaf:   o = self.object_store[sha]   if isinstance(o, Commit): - self.parse_commit(o) + self.add_todo([(o.tree, "", False)])   elif isinstance(o, Tree): - self.parse_tree(o) + self.add_todo([(s, n, not stat.S_ISDIR(m)) + for n, m, s in o.iteritems() + if not S_ISGITLINK(m)])   elif isinstance(o, Tag): - self.parse_tag(o) + self.add_todo([(o.object[1], None, False)])   if sha in self._tagged:   self.add_todo([(self._tagged[sha], None, True)])   self.sha_done.add(sha)
 
121
122
123
 
124
125
126
 
121
122
123
124
125
126
127
@@ -121,6 +121,7 @@
  'lru_cache',   'objects',   'object_store', + 'missing_obj_finder',   'pack',   'patch',   'protocol',
Change 1 of 1 Show Entire File dulwich/​tests/​test_missing_obj_finder.py Stacked
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
@@ -1,0 +1,196 @@
+# test_missing_obj_finder.py -- tests for MissingObjectFinder +# Copyright (C) 2012 syntevo GmbH +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; version 2 +# or (at your option) any later version of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +# MA 02110-1301, USA. + +from dulwich.errors import ( + MissingCommitError, + ) +from dulwich.object_store import ( + MemoryObjectStore, + ) +from dulwich.objects import ( + Commit, + Blob, + ) +from dulwich.tests import TestCase +from utils import ( + F, + make_object, + build_commit_graph, + ) + +class MissingObjectFinderTest(TestCase): + + def setUp(self): + super(MissingObjectFinderTest, self).setUp() + self.store = MemoryObjectStore() + self.commits = [] + + def __getitem__(self, n): + # rename for brevity + return self.commits[n-1] + + def cmt(self, n): + return self[n] + + def assertMissingMatch(self, haves, wants, expected): + for sha,path in self.store.find_missing_objects(haves, wants): + self.assertTrue(sha in expected, "FAILURE: (%s,%s) erroneously reported as missing" % (sha,path)) + expected.remove(sha) + + self.assertFalse(len(expected) > 0, "FAILURE: some objects are not reported as missing: %s" % (expected)) + +class MOFLinearRepoTest(MissingObjectFinderTest): + def setUp(self): + super(MOFLinearRepoTest, self).setUp() + f1_1 = make_object(Blob, data='f1') # present in 1, removed in 3 + f2_1 = make_object(Blob, data='f2') # present in all revisions, changed in 2 and 3 + f2_2 = make_object(Blob, data='f2-changed') + f2_3 = make_object(Blob, data='f2-changed-again') + f3_2 = make_object(Blob, data='f3') # added in 2, left unmodified in 3 + + commit_spec = [[1], [2,1], [3,2]] + trees = {1: [('f1', f1_1), ('f2',f2_1)], + 2: [('f1',f1_1), ('f2', f2_2), ('f3', f3_2)], + 3: [('f2', f2_3), ('f3', f3_2)] } + # commit 1: f1 and f2 + # commit 2: f3 added, f2 changed. Missing shall report commit id and a tree referenced by commit + # commit 3: f1 removed, f2 changed. Commit sha and root tree sha shall be reported as modified + self.commits = build_commit_graph(self.store, commit_spec, trees) + self.missing_1_2 = [self.cmt(2).id, self.cmt(2).tree, f2_2.id, f3_2.id] + self.missing_2_3 = [self.cmt(3).id, self.cmt(3).tree, f2_3.id] + self.missing_1_3 = [ + self.cmt(2).id, self.cmt(3).id, + self.cmt(2).tree, self.cmt(3).tree, + f2_2.id, f3_2.id, f2_3.id] + + + def test_1_to_2(self): + self.assertMissingMatch([self.cmt(1).id], [self.cmt(2).id], self.missing_1_2) + + def test_2_to_3(self): + self.assertMissingMatch([self.cmt(2).id], [self.cmt(3).id], self.missing_2_3) + + def test_1_to_3(self): + self.assertMissingMatch([self.cmt(1).id], [self.cmt(3).id], self.missing_1_3) + + def test_bogus_haves_failure(self): + """Ensure non-existent SHA in haves are not tolerated""" + bogus_sha = self.cmt(2).id[::-1] + haves = [self.cmt(1).id, bogus_sha] + wants = [self.cmt(3).id] + self.assertRaises(KeyError, self.store.find_missing_objects, self.store, haves, wants) + + def test_bogus_wants_failure(self): + """Ensure non-existent SHA in wants are not tolerated""" + bogus_sha = self.cmt(2).id[::-1] + haves = [self.cmt(1).id] + wants = [self.cmt(3).id, bogus_sha] + self.assertRaises(KeyError, self.store.find_missing_objects, self.store, haves, wants) + + def test_no_changes(self): + self.assertMissingMatch([self.cmt(3).id], [self.cmt(3).id], []) + +class MOFMergeForkRepoTest(MissingObjectFinderTest): + """ 1 --- 2 --- 4 --- 6 --- 7 + \ / + 3 --- + \ + 5 + """ + + def setUp(self): + super(MOFMergeForkRepoTest, self).setUp() + f1_1 = make_object(Blob, data='f1') + f1_2 = make_object(Blob, data='f1-2') + f1_4 = make_object(Blob, data='f1-4') + f1_7 = make_object(Blob, data='f1-2') # same data as in rev 2 + f2_1 = make_object(Blob, data='f2') + f2_3 = make_object(Blob, data='f2-3') + f3_3 = make_object(Blob, data='f3') + f3_5 = make_object(Blob, data='f3-5') + commit_spec = [[1], [2,1], [3,2], [4,2], [5,3], [6,3,4], [7,6]] + trees = {1: [('f1', f1_1), ('f2',f2_1)], + 2: [('f1',f1_2), ('f2', f2_1)], # f1 changed + 3: [('f1',f1_2), ('f2', f2_3), ('f3', f3_3)], # f3 added, f2 changed + 4: [('f1',f1_4), ('f2',f2_1)], # f1 changed + 5: [('f1',f1_2), ('f3', f3_5)], # f2 removed, f3 changed + 6: [('f1',f1_4), ('f2',f2_3), ('f3', f3_3)], # merged 3 and 4 + 7: [('f1',f1_7), ('f2',f2_3)]} # f1 changed to match rev2. f3 removed + self.commits = build_commit_graph(self.store, commit_spec, trees) + + self.f1_2_id = f1_2.id + self.f1_4_id = f1_4.id + self.f1_7_id = f1_7.id + self.f2_3_id = f2_3.id + self.f3_3_id = f3_3.id + + self.assertEquals(f1_2.id, f1_7.id, "[sanity]") + + def test_have6_want7(self): + """ + have 6, want 7. Ideally, shall not report f1_7 as it's the same as f1_2, + however, to do so, MissingObjectFinder shall not record trees of common commits + only, but also all parent trees and tree items, which is an overkill + (i.e. in sha_done it records f1_4 as known, and doesn't record f1_2 was known + prior to that, hence can't detect f1_7 is in fact f1_2 and shall not be reported) + """ + self.assertMissingMatch([self.cmt(6).id], [self.cmt(7).id], [self.cmt(7).id, self.cmt(7).tree, self.f1_7_id]) + + def test_have4_want7(self): + """ + have 4, want 7. Shall not include rev5 as it is not in the tree between 4 and 7 + (well, it is, but its SHA's are irrelevant for 4..7 commit hierarchy) + """ + self.assertMissingMatch([self.cmt(4).id], [self.cmt(7).id], [ + self.cmt(7).id, self.cmt(6).id, self.cmt(3).id, + self.cmt(7).tree, self.cmt(6).tree, self.cmt(3).tree, + self.f2_3_id, self.f3_3_id]) + + def test_have1_want6(self): + """ + have 1, want 6. Shall not include rev5 + """ + self.assertMissingMatch([self.cmt(1).id], [self.cmt(6).id], [ + self.cmt(6).id, self.cmt(4).id, self.cmt(3).id, self.cmt(2).id, + self.cmt(6).tree, self.cmt(4).tree, self.cmt(3).tree, self.cmt(2).tree, + self.f1_2_id, self.f1_4_id, self.f2_3_id, self.f3_3_id]) + + def test_have3_want6(self): + """ + have 3, want 7. Shall not report rev2 and its tree, because + haves(3) means has parents, i.e. rev2, too + BUT shall report any changes descending rev2 (excluding rev3) + Shall NOT report f1_7 as it's techically == f1_2 + """ + self.assertMissingMatch([self.cmt(3).id], [self.cmt(7).id], [ + self.cmt(7).id, self.cmt(6).id, self.cmt(4).id, + self.cmt(7).tree, self.cmt(6).tree, self.cmt(4).tree, + self.f1_4_id]) + + def test_have5_want7(self): + """ + have 5, want 7. Common parent is rev2, hence children of rev2 from + a descent line other than rev5 shall be reported + """ + # expects f1_4 from rev6. f3_5 is known in rev5; + # f1_7 shall be the same as f1_2 (known, too) + self.assertMissingMatch([self.cmt(5).id], [self.cmt(7).id], [ + self.cmt(7).id, self.cmt(6).id, self.cmt(4).id, + self.cmt(7).tree, self.cmt(6).tree, self.cmt(4).tree, + self.f1_4_id]) +