Kiln » Dependencies » Dulwich Read More
Clone URL:  
Pushed to one repository · View In Graph Contained in master

Lazily read the contents of ShaFiles from disk.

Previously, ShaFile.from_file read and inflated the entire contents of a
ShaFile, storing the inflated text in memory. Parsing that text was done
lazily, but this resulted in a confusing performance profile, since the
"lazy" part was in fact faster (memory/CPU-bound) than the eager part
(I/O-bound).

This change ensures that a file is only read fully from disk right
before it is about to be parsed. The first few bytes are still read
from disk to get the file type, but there is no need to slurp the
whole file. Moreover, we now maintain a distinction between ShaFiles
that are initialized from (possibly raw) strings and files initialized
with filenames. In order to parse the contents of a file, we need to
have one or the other. There is a third category of ShaFiles that
should never be parsed, where all the properties are intended to be set
explicitly — usually, but not always, during testing. These objects are
created as usual via __init__.

The tests all pass, but we do not currently have a way of testing the
point at which the implementation reads the whole file from disk.

Also, note that a call to foo.id still results in reading the whole file
and calculating its SHA-1; this behavior may change in the future.

Change-Id: Ib9f67ead0d2a812a2e43cffa54c37a1c4a219841

Changeset 79b760a98784

Parent 2782f6edb956

committed by Jelmer Vernooij

authored by Dave Borowitz

Changes to 3 files · Browse files at 79b760a98784 Showing diff from parent 2782f6edb956 Diff from another changeset...

 
378
379
380
381
 
382
383
384
 
378
379
380
 
381
382
383
384
@@ -378,7 +378,7 @@
  path = self._get_shafile_path(sha)   try:   return ShaFile.from_file(path) - except OSError, e: + except (OSError, IOError), e:   if e.errno == errno.ENOENT:   return None   raise
Change 1 of 13 Show Entire File dulwich/​objects.py Stacked
 
99
100
101
102
 
 
103
104
 
105
106
107
 
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
154
155
 
162
163
164
165
 
 
 
166
167
168
169
170
 
181
182
183
 
 
 
184
185
186
 
195
196
197
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
 
 
 
 
215
216
217
218
219
220
 
 
 
 
221
222
 
223
224
225
226
 
 
 
 
227
228
229
 
231
232
233
 
 
 
 
 
 
 
 
 
 
 
234
235
236
237
 
238
239
240
241
242
 
 
 
 
 
 
 
243
244
245
 
267
268
269
270
 
271
272
273
 
367
368
369
370
 
371
372
 
373
374
375
376
 
 
 
 
 
 
 
 
 
377
378
379
 
424
425
426
427
428
429
430
431
 
434
435
436
437
438
439
440
441
442
443
444
445
446
 
600
601
602
603
604
605
606
607
 
668
669
670
671
672
673
674
 
746
747
748
749
750
751
752
753
 
99
100
101
 
102
103
104
 
105
106
107
108
 
125
126
127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
 
171
172
173
 
174
175
176
177
 
178
179
180
 
191
192
193
194
195
196
197
198
199
 
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
241
242
243
244
245
 
 
 
 
246
247
248
249
250
 
251
252
253
254
255
256
257
258
259
260
261
262
 
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
 
 
280
281
282
 
 
 
283
284
285
286
287
288
289
290
291
292
 
314
315
316
 
317
318
319
320
 
414
415
416
 
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
 
481
482
483
 
 
484
485
486
 
489
490
491
 
 
 
 
 
 
 
492
493
494
 
648
649
650
 
 
651
652
653
 
714
715
716
 
717
718
719
 
791
792
793
 
 
794
795
796
@@ -99,9 +99,10 @@
  """Get the object class corresponding to the given type.     :param type: Either a type name string or a numeric type. - :return: The ShaFile subclass corresponding to the given type. + :return: The ShaFile subclass corresponding to the given type, or None if + type is not a valid type name/number.   """ - return _TYPE_MAP[type] + return _TYPE_MAP.get(type, None)      def check_hexsha(hex, error_msg): @@ -124,32 +125,40 @@
 class ShaFile(object):   """A git SHA file."""   - @classmethod - def _parse_legacy_object(cls, map): - """Parse a legacy object, creating it and setting object._text""" - text = _decompress(map) - object = None - for cls in OBJECT_CLASSES: - if text.startswith(cls.type_name): - object = cls() - text = text[len(cls.type_name):] - break - assert object is not None, "%s is not a known object type" % text[:9] - assert text[0] == ' ', "%s is not a space" % text[0] - text = text[1:] - size = 0 - i = 0 - while text[0] >= '0' and text[0] <= '9': - if i > 0 and size == 0: - raise AssertionError("Size is not in canonical format") - size = (size * 10) + int(text[0]) - text = text[1:] - i += 1 - object._size = size - assert text[0] == "\0", "Size not followed by null" - text = text[1:] - object.set_raw_string(text) - return object + @staticmethod + def _parse_legacy_object_header(magic, f): + """Parse a legacy object, creating it but not reading the file.""" + bufsize = 1024 + decomp = zlib.decompressobj() + header = decomp.decompress(magic) + start = 0 + end = -1 + while end < 0: + header += decomp.decompress(f.read(bufsize)) + end = header.find("\0", start) + start = len(header) + header = header[:end] + type_name, size = header.split(" ", 1) + size = int(size) # sanity check + obj_class = object_class(type_name) + if not obj_class: + raise ObjectFormatException("Not a known type: %s" % type_name) + obj = obj_class() + obj._filename = f.name + return obj + + def _parse_legacy_object(self, f): + """Parse a legacy object, setting the raw string.""" + size = os.path.getsize(f.name) + map = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ) + try: + text = _decompress(map) + finally: + map.close() + header_end = text.find('\0') + if header_end < 0: + raise ObjectFormatException("Invalid object header") + self.set_raw_string(text[header_end+1:])     def as_legacy_object_chunks(self):   compobj = zlib.compressobj() @@ -162,9 +171,10 @@
  return "".join(self.as_legacy_object_chunks())     def as_raw_chunks(self): - if self._needs_serialization: + if self._needs_parsing: + self._ensure_parsed() + else:   self._chunked_text = self._serialize() - self._needs_serialization = False   return self._chunked_text     def as_raw_string(self): @@ -181,6 +191,9 @@
    def _ensure_parsed(self):   if self._needs_parsing: + if not self._chunked_text: + assert self._filename, "ShaFile needs either text or filename" + self._parse_file()   self._deserialize(self._chunked_text)   self._needs_parsing = False   @@ -195,35 +208,55 @@
  self._needs_parsing = True   self._needs_serialization = False   + @staticmethod + def _parse_object_header(magic, f): + """Parse a new style object, creating it but not reading the file.""" + num_type = (ord(magic[0]) >> 4) & 7 + obj_class = object_class(num_type) + if not obj_class: + raise ObjectFormatError("Not a known type: %d" % num_type) + obj = obj_class() + obj._filename = f.name + return obj + + def _parse_object(self, f): + """Parse a new style object, setting self._text.""" + size = os.path.getsize(f.name) + map = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ) + try: + # skip type and size; type must have already been determined, and we + # trust zlib to fail if it's otherwise corrupted + byte = ord(map[0]) + used = 1 + while (byte & 0x80) != 0: + byte = ord(map[used]) + used += 1 + raw = map[used:] + self.set_raw_string(_decompress(raw)) + finally: + map.close() +   @classmethod - def _parse_object(cls, map): - """Parse a new style object , creating it and setting object._text""" - used = 0 - byte = ord(map[used]) - used += 1 - type_num = (byte >> 4) & 7 - try: - object = object_class(type_num)() - except KeyError: - raise AssertionError("Not a known type: %d" % type_num) - while (byte & 0x80) != 0: - byte = ord(map[used]) - used += 1 - raw = map[used:] - object.set_raw_string(_decompress(raw)) - return object + def _is_legacy_object(cls, magic): + b0, b1 = map(ord, magic) + word = (b0 << 8) + b1 + return b0 == 0x78 and (word % 31) == 0     @classmethod - def _parse_file(cls, map): - word = (ord(map[0]) << 8) + ord(map[1]) - if ord(map[0]) == 0x78 and (word % 31) == 0: - return cls._parse_legacy_object(map) + def _parse_file_header(cls, f): + magic = f.read(2) + if cls._is_legacy_object(magic): + return cls._parse_legacy_object_header(magic, f)   else: - return cls._parse_object(map) + return cls._parse_object_header(magic, f)     def __init__(self):   """Don't call this directly"""   self._sha = None + self._filename = None + 
self._chunked_text = [] + self._needs_parsing = False + self._needs_serialization = True     def _deserialize(self, chunks):   raise NotImplementedError(self._deserialize) @@ -231,15 +264,29 @@
  def _serialize(self):   raise NotImplementedError(self._serialize)   + def _parse_file(self): + f = GitFile(self._filename, 'rb') + try: + magic = f.read(2) + if self._is_legacy_object(magic): + self._parse_legacy_object(f) + else: + self._parse_object(f) + finally: + f.close() +   @classmethod   def from_file(cls, filename): - """Get the contents of a SHA file on disk""" - size = os.path.getsize(filename) + """Get the contents of a SHA file on disk."""   f = GitFile(filename, 'rb')   try: - map = mmap.mmap(f.fileno(), size, access=mmap.ACCESS_READ) - shafile = cls._parse_file(map) - return shafile + try: + obj = cls._parse_file_header(f) + obj._needs_parsing = True + obj._needs_serialization = True + return obj + except (IndexError, ValueError), e: + raise ObjectFormatException("invalid object header")   finally:   f.close()   @@ -267,7 +314,7 @@
    @classmethod   def from_string(cls, string): - """Create a blob from a string.""" + """Create a ShaFile from a string."""   obj = cls()   obj.set_raw_string(string)   return obj @@ -367,13 +414,23 @@
  self.set_raw_string(data)     data = property(_get_data, _set_data, - "The text contained within the blob object.") + "The text contained within the blob object.")     def _get_chunked(self): + self._ensure_parsed()   return self._chunked_text     def _set_chunked(self, chunks):   self._chunked_text = chunks + + def _serialize(self): + if not self._chunked_text: + self._ensure_parsed() + self._needs_serialization = False + return self._chunked_text + + def _deserialize(self, chunks): + return "".join(chunks)     chunked = property(_get_chunked, _set_chunked,   "The text within the blob object, as chunks (not necessarily lines).") @@ -424,8 +481,6 @@
    def __init__(self):   super(Tag, self).__init__() - self._needs_parsing = False - self._needs_serialization = True   self._tag_timezone_neg_utc = False     @classmethod @@ -434,13 +489,6 @@
  if not isinstance(tag, cls):   raise NotTagError(filename)   return tag - - @classmethod - def from_string(cls, string): - """Create a blob from a string.""" - shafile = cls() - shafile.set_raw_string(string) - return shafile     def check(self):   """Check this object for internal consistency. @@ -600,8 +648,6 @@
  def __init__(self):   super(Tree, self).__init__()   self._entries = {} - self._needs_parsing = False - self._needs_serialization = True     @classmethod   def from_file(cls, filename): @@ -668,7 +714,6 @@
  # TODO: list comprehension is for efficiency in the common (small) case;   # if memory efficiency in the large case is a concern, use a genexp.   self._entries = dict([(n, (m, s)) for n, m, s in parsed_entries]) - self._needs_parsing = False     def check(self):   """Check this object for internal consistency. @@ -746,8 +791,6 @@
  super(Commit, self).__init__()   self._parents = []   self._encoding = None - self._needs_parsing = False - self._needs_serialization = True   self._extra = {}   self._author_timezone_neg_utc = False   self._commit_timezone_neg_utc = False
 
212
213
214
215
 
 
216
217
218
219
 
 
220
221
222
 
343
344
345
346
347
348
 
 
 
349
350
351
352
 
 
353
354
355
 
356
357
 
358
359
360
361
 
 
 
362
363
364
 
369
370
371
372
 
373
374
 
375
376
377
 
382
383
384
385
 
386
387
 
388
389
390
 
406
407
408
 
409
410
411
 
418
419
420
421
 
422
423
424
 
530
531
532
533
534
535
536
537
538
 
 
 
 
 
 
539
540
 
541
542
 
543
544
545
 
546
547
548
549
550
551
552
 
553
554
555
 
560
561
562
563
 
564
565
 
566
567
568
 
212
213
214
 
215
216
217
218
219
 
220
221
222
223
224
 
345
346
347
 
 
 
348
349
350
351
352
 
 
353
354
355
356
 
357
358
 
359
360
 
 
 
361
362
363
364
365
366
 
371
372
373
 
374
375
 
376
377
378
379
 
384
385
386
 
387
388
 
389
390
391
392
 
408
409
410
411
412
413
414
 
421
422
423
 
424
425
426
427
 
533
534
535
 
 
 
 
 
 
536
537
538
539
540
541
542
 
543
544
 
545
546
547
 
548
549
550
551
552
553
554
 
555
556
557
558
 
563
564
565
 
566
567
 
568
569
570
571
@@ -212,11 +212,13 @@
   class ShaFileCheckTests(unittest.TestCase):   - def assertCheckFails(self, obj, data): + def assertCheckFails(self, cls, data): + obj = cls()   obj.set_raw_string(data)   self.assertRaises(ObjectFormatException, obj.check)   - def assertCheckSucceeds(self, obj, data): + def assertCheckSucceeds(self, cls, data): + obj = cls()   obj.set_raw_string(data)   try:   obj.check() @@ -343,22 +345,22 @@
  self.assertEquals('UTF-8', c.encoding)     def test_check(self): - self.assertCheckSucceeds(Commit(), self.make_commit_text()) - self.assertCheckSucceeds(Commit(), self.make_commit_text(parents=None)) - self.assertCheckSucceeds(Commit(), + self.assertCheckSucceeds(Commit, self.make_commit_text()) + self.assertCheckSucceeds(Commit, self.make_commit_text(parents=None)) + self.assertCheckSucceeds(Commit,   self.make_commit_text(encoding='UTF-8'))   - self.assertCheckFails(Commit(), self.make_commit_text(tree='xxx')) - self.assertCheckFails(Commit(), self.make_commit_text( + self.assertCheckFails(Commit, self.make_commit_text(tree='xxx')) + self.assertCheckFails(Commit, self.make_commit_text(   parents=[a_sha, 'xxx']))   bad_committer = "some guy without an email address 1174773719 +0000" - self.assertCheckFails(Commit(), + self.assertCheckFails(Commit,   self.make_commit_text(committer=bad_committer)) - self.assertCheckFails(Commit(), + self.assertCheckFails(Commit,   self.make_commit_text(author=bad_committer)) - self.assertCheckFails(Commit(), self.make_commit_text(author=None)) - self.assertCheckFails(Commit(), self.make_commit_text(committer=None)) - self.assertCheckFails(Commit(), self.make_commit_text( + self.assertCheckFails(Commit, self.make_commit_text(author=None)) + self.assertCheckFails(Commit, self.make_commit_text(committer=None)) + self.assertCheckFails(Commit, self.make_commit_text(   author=None, committer=None))     def test_check_duplicates(self): @@ -369,9 +371,9 @@
  text = '\n'.join(lines)   if lines[i].startswith('parent'):   # duplicate parents are ok for now - self.assertCheckSucceeds(Commit(), text) + self.assertCheckSucceeds(Commit, text)   else: - self.assertCheckFails(Commit(), text) + self.assertCheckFails(Commit, text)     def test_check_order(self):   lines = self.make_commit_lines(parents=[a_sha], encoding='UTF-8') @@ -382,9 +384,9 @@
  perm = list(perm)   text = '\n'.join(perm + rest)   if perm == headers: - self.assertCheckSucceeds(Commit(), text) + self.assertCheckSucceeds(Commit, text)   else: - self.assertCheckFails(Commit(), text) + self.assertCheckFails(Commit, text)      class TreeTests(ShaFileCheckTests): @@ -406,6 +408,7 @@
  def _do_test_parse_tree(self, parse_tree):   o = Tree.from_file(os.path.join(os.path.dirname(__file__), 'data',   'trees', tree_sha)) + o._parse_file()   self.assertEquals([('a', 0100644, a_sha), ('b', 0100644, b_sha)],   list(parse_tree(o.as_raw_string())))   @@ -418,7 +421,7 @@
  self._do_test_parse_tree(parse_tree)     def test_check(self): - t = Tree() + t = Tree   sha = hex_to_sha(a_sha)     # filenames @@ -530,26 +533,26 @@
  self.assertEquals("v2.6.22-rc7", x.name)     def test_check(self): - self.assertCheckSucceeds(Tag(), self.make_tag_text()) - self.assertCheckFails(Tag(), self.make_tag_text(object_sha=None)) - self.assertCheckFails(Tag(), self.make_tag_text(object_type_name=None)) - self.assertCheckFails(Tag(), self.make_tag_text(name=None)) - self.assertCheckFails(Tag(), self.make_tag_text(name='')) - self.assertCheckFails(Tag(), self.make_tag_text( + self.assertCheckSucceeds(Tag, self.make_tag_text()) + self.assertCheckFails(Tag, self.make_tag_text(object_sha=None)) + self.assertCheckFails(Tag, self.make_tag_text(object_type_name=None)) + self.assertCheckFails(Tag, self.make_tag_text(name=None)) + self.assertCheckFails(Tag, self.make_tag_text(name='')) + self.assertCheckFails(Tag, self.make_tag_text(   object_type_name="foobar")) - self.assertCheckFails(Tag(), self.make_tag_text( + self.assertCheckFails(Tag, self.make_tag_text(   tagger="some guy without an email address 1183319674 -0700")) - self.assertCheckFails(Tag(), self.make_tag_text( + self.assertCheckFails(Tag, self.make_tag_text(   tagger=("Linus Torvalds <torvalds@woody.linux-foundation.org> "   "Sun 7 Jul 2007 12:54:34 +0700"))) - self.assertCheckFails(Tag(), self.make_tag_text(object_sha="xxx")) + self.assertCheckFails(Tag, self.make_tag_text(object_sha="xxx"))     def test_check_duplicates(self):   # duplicate each of the header fields   for i in xrange(4):   lines = self.make_tag_lines()   lines.insert(i, lines[i]) - self.assertCheckFails(Tag(), '\n'.join(lines)) + self.assertCheckFails(Tag, '\n'.join(lines))     def test_check_order(self):   lines = self.make_tag_lines() @@ -560,9 +563,9 @@
  perm = list(perm)   text = '\n'.join(perm + rest)   if perm == headers: - self.assertCheckSucceeds(Tag(), text) + self.assertCheckSucceeds(Tag, text)   else: - self.assertCheckFails(Tag(), text) + self.assertCheckFails(Tag, text)      class CheckTests(unittest.TestCase):