gitdb-0.5.4/

gitdb-0.5.4/LICENSE
Copyright (C) 2010, 2011 Sebastian Thiel and contributors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

* Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.

* Neither the name of the GitDB project nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Additional Licenses
-------------------
The files at
gitdb/test/fixtures/packs/pack-11fdfa9e156ab73caae3b6da867192221f2089c2.idx
and
gitdb/test/fixtures/packs/pack-11fdfa9e156ab73caae3b6da867192221f2089c2.pack
are licensed under GNU GPL as part of the git source repository, see
http://en.wikipedia.org/wiki/Git_%28software%29 for more information.

They are not required for the actual operation, which is why they are not
found in the distribution package.

gitdb-0.5.4/PKG-INFO
Metadata-Version: 1.1
Name: gitdb
Version: 0.5.4
Summary: Git Object Database
Home-page: https://github.com/gitpython-developers/gitdb
Author: Sebastian Thiel
Author-email: byronimo@gmail.com
License: BSD License
Description: GitDB is a pure-Python git object database
Platform: UNKNOWN
Requires: async (>=0.6.1)
Requires: smmap (>=0.8.0)

gitdb-0.5.4/setup.py
#!/usr/bin/env python
from distutils.core import setup, Extension
from distutils.command.build_py import build_py
from distutils.command.build_ext import build_ext
import os, sys

# wow, this is a mixed bag ... I am pretty upset about all of this ...
setuptools_build_py_module = None
try:
	# don't pull it in if we don't have to
	if 'setuptools' in sys.modules:
		import setuptools.command.build_py as setuptools_build_py_module
		from setuptools.command.build_ext import build_ext
except ImportError:
	pass

class build_ext_nofail(build_ext):
	"""Doesn't fail when building our optional extensions"""
	def run(self):
		try:
			build_ext.run(self)
		except Exception:
			print "Ignored failure when building extensions, pure python modules will be used instead"
		# END ignore errors

def get_data_files(self):
	"""Can you feel the pain ?
	So, in python2.5 and python2.4 coming with maya, the line dealing with the
	``plen`` has a bug which causes it to truncate too much. It is fixed in the
	system interpreters as they receive patches, and shows how bad it is if
	something doesn't have proper unittests.
	The code here is a plain copy of the python2.6 version which works for all.

	Generate list of '(package,src_dir,build_dir,filenames)' tuples"""
	data = []
	if not self.packages:
		return data

	# this one is just for the setup tools ! They don't initialize this variable
	# when they should, but do it on demand using this method. It's crazy
	if hasattr(self, 'analyze_manifest'):
		self.analyze_manifest()
	# END handle setuptools ...

	for package in self.packages:
		# Locate package source directory
		src_dir = self.get_package_dir(package)

		# Compute package build directory
		build_dir = os.path.join(*([self.build_lib] + package.split('.')))

		# Length of path to strip from found files
		plen = 0
		if src_dir:
			plen = len(src_dir)+1

		# Strip directory from globbed filenames
		filenames = [ file[plen:] for file in self.find_data_files(package, src_dir) ]
		data.append((package, src_dir, build_dir, filenames))
	return data

build_py.get_data_files = get_data_files
if setuptools_build_py_module:
	setuptools_build_py_module.build_py._get_data_files = get_data_files
# END apply setuptools patch too


# NOTE: This is currently duplicated from the gitdb.__init__ module, as we cannot
# satisfy the dependencies at installation time, unfortunately, due to inherent limitations
# of distutils, which cannot install the prerequisites of a package before the actual package.
__author__ = "Sebastian Thiel"
__contact__ = "byronimo@gmail.com"
__homepage__ = "https://github.com/gitpython-developers/gitdb"
version_info = (0, 5, 4)
__version__ = '.'.join(str(i) for i in version_info)

setup(cmdclass={'build_ext':build_ext_nofail},
	name = "gitdb",
	version = __version__,
	description = "Git Object Database",
	author = __author__,
	author_email = __contact__,
	url = __homepage__,
	packages = ('gitdb', 'gitdb.db'),
	package_dir = {'gitdb':'gitdb'},
	ext_modules=[Extension('gitdb._perf', ['gitdb/_fun.c', 'gitdb/_delta_apply.c'], include_dirs=['gitdb'])],
	license = "BSD License",
	zip_safe=False,
	requires=('async (>=0.6.1)', 'smmap (>=0.8.0)'),
	install_requires=('async >= 0.6.1', 'smmap >= 0.8.0'),
	long_description = """GitDB is a pure-Python git object database""" )

gitdb-0.5.4/AUTHORS
Creator: Sebastian Thiel

gitdb-0.5.4/gitdb/

gitdb-0.5.4/gitdb/base.py
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
"""Module with basic data structures - they are designed to be lightweight and fast"""
from util import (
	bin_to_hex,
	zlib
	)

from fun import (
	type_id_to_type_map,
	type_to_type_id_map
	)

__all__ = ('OInfo', 'OPackInfo', 'ODeltaPackInfo',
	'OStream', 'OPackStream', 'ODeltaPackStream',
	'IStream', 'InvalidOInfo', 'InvalidOStream' )

#{ ODB Bases

class OInfo(tuple):
	"""Carries information about an object in an ODB, providing information
	about the binary sha of the object, the type_string as well as the uncompressed size
	in bytes.
It can be accessed using tuple notation and using attribute access notation:: assert dbi[0] == dbi.binsha assert dbi[1] == dbi.type assert dbi[2] == dbi.size The type is designed to be as lighteight as possible.""" __slots__ = tuple() def __new__(cls, sha, type, size): return tuple.__new__(cls, (sha, type, size)) def __init__(self, *args): tuple.__init__(self) #{ Interface @property def binsha(self): """:return: our sha as binary, 20 bytes""" return self[0] @property def hexsha(self): """:return: our sha, hex encoded, 40 bytes""" return bin_to_hex(self[0]) @property def type(self): return self[1] @property def type_id(self): return type_to_type_id_map[self[1]] @property def size(self): return self[2] #} END interface class OPackInfo(tuple): """As OInfo, but provides a type_id property to retrieve the numerical type id, and does not include a sha. Additionally, the pack_offset is the absolute offset into the packfile at which all object information is located. The data_offset property points to the abosolute location in the pack at which that actual data stream can be found.""" __slots__ = tuple() def __new__(cls, packoffset, type, size): return tuple.__new__(cls, (packoffset,type, size)) def __init__(self, *args): tuple.__init__(self) #{ Interface @property def pack_offset(self): return self[0] @property def type(self): return type_id_to_type_map[self[1]] @property def type_id(self): return self[1] @property def size(self): return self[2] #} END interface class ODeltaPackInfo(OPackInfo): """Adds delta specific information, Either the 20 byte sha which points to some object in the database, or the negative offset from the pack_offset, so that pack_offset - delta_info yields the pack offset of the base object""" __slots__ = tuple() def __new__(cls, packoffset, type, size, delta_info): return tuple.__new__(cls, (packoffset, type, size, delta_info)) #{ Interface @property def delta_info(self): return self[3] #} END interface class OStream(OInfo): """Base for object streams retrieved from the database, providing additional information about the stream. 
Generally, ODB streams are read-only as objects are immutable""" __slots__ = tuple() def __new__(cls, sha, type, size, stream, *args, **kwargs): """Helps with the initialization of subclasses""" return tuple.__new__(cls, (sha, type, size, stream)) def __init__(self, *args, **kwargs): tuple.__init__(self) #{ Stream Reader Interface def read(self, size=-1): return self[3].read(size) @property def stream(self): return self[3] #} END stream reader interface class ODeltaStream(OStream): """Uses size info of its stream, delaying reads""" def __new__(cls, sha, type, size, stream, *args, **kwargs): """Helps with the initialization of subclasses""" return tuple.__new__(cls, (sha, type, size, stream)) #{ Stream Reader Interface @property def size(self): return self[3].size #} END stream reader interface class OPackStream(OPackInfo): """Next to pack object information, a stream outputting an undeltified base object is provided""" __slots__ = tuple() def __new__(cls, packoffset, type, size, stream, *args): """Helps with the initialization of subclasses""" return tuple.__new__(cls, (packoffset, type, size, stream)) #{ Stream Reader Interface def read(self, size=-1): return self[3].read(size) @property def stream(self): return self[3] #} END stream reader interface class ODeltaPackStream(ODeltaPackInfo): """Provides a stream outputting the uncompressed offset delta information""" __slots__ = tuple() def __new__(cls, packoffset, type, size, delta_info, stream): return tuple.__new__(cls, (packoffset, type, size, delta_info, stream)) #{ Stream Reader Interface def read(self, size=-1): return self[4].read(size) @property def stream(self): return self[4] #} END stream reader interface class IStream(list): """Represents an input content stream to be fed into the ODB. It is mutable to allow the ODB to record information about the operations outcome right in this instance. It provides interfaces for the OStream and a StreamReader to allow the instance to blend in without prior conversion. The only method your content stream must support is 'read'""" __slots__ = tuple() def __new__(cls, type, size, stream, sha=None): return list.__new__(cls, (sha, type, size, stream, None)) def __init__(self, type, size, stream, sha=None): list.__init__(self, (sha, type, size, stream, None)) #{ Interface @property def hexsha(self): """:return: our sha, hex encoded, 40 bytes""" return bin_to_hex(self[0]) def _error(self): """:return: the error that occurred when processing the stream, or None""" return self[4] def _set_error(self, exc): """Set this input stream to the given exc, may be None to reset the error""" self[4] = exc error = property(_error, _set_error) #} END interface #{ Stream Reader Interface def read(self, size=-1): """Implements a simple stream reader interface, passing the read call on to our internal stream""" return self[3].read(size) #} END stream reader interface #{ interface def _set_binsha(self, binsha): self[0] = binsha def _binsha(self): return self[0] binsha = property(_binsha, _set_binsha) def _type(self): return self[1] def _set_type(self, type): self[1] = type type = property(_type, _set_type) def _size(self): return self[2] def _set_size(self, size): self[2] = size size = property(_size, _set_size) def _stream(self): return self[3] def _set_stream(self, stream): self[3] = stream stream = property(_stream, _set_stream) #} END odb info interface class InvalidOInfo(tuple): """Carries information about a sha identifying an object which is invalid in the queried database. 
	The exception attribute provides more information about the cause of the issue"""
	__slots__ = tuple()

	def __new__(cls, sha, exc):
		return tuple.__new__(cls, (sha, exc))

	def __init__(self, sha, exc):
		tuple.__init__(self, (sha, exc))

	@property
	def binsha(self):
		return self[0]

	@property
	def hexsha(self):
		return bin_to_hex(self[0])

	@property
	def error(self):
		""":return: exception instance explaining the failure"""
		return self[1]


class InvalidOStream(InvalidOInfo):
	"""Carries information about an invalid ODB stream"""
	__slots__ = tuple()

#} END ODB Bases

gitdb-0.5.4/gitdb/exc.py
# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
"""Module with common exceptions"""
from util import to_hex_sha

class ODBError(Exception):
	"""All errors thrown by the object database"""

class InvalidDBRoot(ODBError):
	"""Thrown if an object database cannot be initialized at the given path"""

class BadObject(ODBError):
	"""The object with the given SHA does not exist. Instantiate with the failed sha"""

	def __str__(self):
		return "BadObject: %s" % to_hex_sha(self.args[0])

class ParseError(ODBError):
	"""Thrown if the parsing of a file failed due to an invalid format"""

class AmbiguousObjectName(ODBError):
	"""Thrown if a possibly shortened name does not uniquely represent a single object in the database"""

class BadObjectType(ODBError):
	"""The object had an unsupported type"""

class UnsupportedOperation(ODBError):
	"""Thrown if the given operation cannot be supported by the object database"""

gitdb-0.5.4/gitdb/_delta_apply.h
#include <Python.h>

extern PyObject* connect_deltas(PyObject *self, PyObject *dstreams);
extern PyObject* apply_delta(PyObject* self, PyObject* args);

extern PyTypeObject DeltaChunkListType;

gitdb-0.5.4/gitdb/_fun.c
#include <Python.h>
#include "_delta_apply.h"


static PyObject *PackIndexFile_sha_to_index(PyObject *self, PyObject *args)
{
	const unsigned char *sha;
	const unsigned int sha_len;

	// Note: self is only set if we are a c type.
We emulate an instance method, // hence we have to get the instance as 'first' argument // get instance and sha PyObject* inst = 0; if (!PyArg_ParseTuple(args, "Os#", &inst, &sha, &sha_len)) return NULL; if (sha_len != 20) { PyErr_SetString(PyExc_ValueError, "Sha is not 20 bytes long"); return NULL; } if( !inst){ PyErr_SetString(PyExc_ValueError, "Cannot be called without self"); return NULL; } // read lo and hi bounds PyObject* fanout_table = PyObject_GetAttrString(inst, "_fanout_table"); if (!fanout_table){ PyErr_SetString(PyExc_ValueError, "Couldn't obtain fanout table"); return NULL; } unsigned int lo = 0, hi = 0; if (sha[0]){ PyObject* item = PySequence_GetItem(fanout_table, (const Py_ssize_t)(sha[0]-1)); lo = PyInt_AS_LONG(item); Py_DECREF(item); } PyObject* item = PySequence_GetItem(fanout_table, (const Py_ssize_t)sha[0]); hi = PyInt_AS_LONG(item); Py_DECREF(item); item = 0; Py_DECREF(fanout_table); // get sha query function PyObject* get_sha = PyObject_GetAttrString(inst, "sha"); if (!get_sha){ PyErr_SetString(PyExc_ValueError, "Couldn't obtain sha method"); return NULL; } PyObject *sha_str = 0; while (lo < hi) { const int mid = (lo + hi)/2; sha_str = PyObject_CallFunction(get_sha, "i", mid); if (!sha_str) { return NULL; } // we really trust that string ... for speed const int cmp = memcmp(PyString_AS_STRING(sha_str), sha, 20); Py_DECREF(sha_str); sha_str = 0; if (cmp < 0){ lo = mid + 1; } else if (cmp > 0) { hi = mid; } else { Py_DECREF(get_sha); return PyInt_FromLong(mid); }// END handle comparison }// END while lo < hi // nothing found, cleanup Py_DECREF(get_sha); Py_RETURN_NONE; } static PyMethodDef py_fun[] = { { "PackIndexFile_sha_to_index", (PyCFunction)PackIndexFile_sha_to_index, METH_VARARGS, "TODO" }, { "connect_deltas", (PyCFunction)connect_deltas, METH_O, "See python implementation" }, { "apply_delta", (PyCFunction)apply_delta, METH_VARARGS, "See python implementation" }, { NULL, NULL, 0, NULL } }; #ifndef PyMODINIT_FUNC /* declarations for DLL import/export */ #define PyMODINIT_FUNC void #endif PyMODINIT_FUNC init_perf(void) { PyObject *m; if (PyType_Ready(&DeltaChunkListType) < 0) return; m = Py_InitModule3("_perf", py_fun, NULL); if (m == NULL) return; Py_INCREF(&DeltaChunkListType); PyModule_AddObject(m, "DeltaChunkList", (PyObject *)&DeltaChunkListType); } gitdb-0.5.4/gitdb/fun.py0000644000175100017510000004562111575506132014127 0ustar byronbyron# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under # the New BSD License: http://www.opensource.org/licenses/bsd-license.php """Contains basic c-functions which usually contain performance critical code Keeping this code separate from the beginning makes it easier to out-source it into c later, if required""" from exc import ( BadObjectType ) from util import zlib decompressobj = zlib.decompressobj import mmap from itertools import islice, izip from cStringIO import StringIO # INVARIANTS OFS_DELTA = 6 REF_DELTA = 7 delta_types = (OFS_DELTA, REF_DELTA) type_id_to_type_map = { 0 : "", # EXT 1 1 : "commit", 2 : "tree", 3 : "blob", 4 : "tag", 5 : "", # EXT 2 OFS_DELTA : "OFS_DELTA", # OFFSET DELTA REF_DELTA : "REF_DELTA" # REFERENCE DELTA } type_to_type_id_map = dict( commit=1, tree=2, blob=3, tag=4, OFS_DELTA=OFS_DELTA, REF_DELTA=REF_DELTA ) # used when dealing with larger streams chunk_size = 1000*mmap.PAGESIZE __all__ = ('is_loose_object', 'loose_object_header_info', 'msb_size', 'pack_object_header_info', 'write_object', 
'loose_object_header', 'stream_copy', 'apply_delta_data', 'is_equal_canonical_sha', 'connect_deltas', 'DeltaChunkList', 'create_pack_object_header') #{ Structures def _set_delta_rbound(d, size): """Truncate the given delta to the given size :param size: size relative to our target offset, may not be 0, must be smaller or equal to our size :return: d""" d.ts = size # NOTE: data is truncated automatically when applying the delta # MUST NOT DO THIS HERE return d def _move_delta_lbound(d, bytes): """Move the delta by the given amount of bytes, reducing its size so that its right bound stays static :param bytes: amount of bytes to move, must be smaller than delta size :return: d""" if bytes == 0: return d.to += bytes d.so += bytes d.ts -= bytes if d.data is not None: d.data = d.data[bytes:] # END handle data return d def delta_duplicate(src): return DeltaChunk(src.to, src.ts, src.so, src.data) def delta_chunk_apply(dc, bbuf, write): """Apply own data to the target buffer :param bbuf: buffer providing source bytes for copy operations :param write: write method to call with data to write""" if dc.data is None: # COPY DATA FROM SOURCE write(buffer(bbuf, dc.so, dc.ts)) else: # APPEND DATA # whats faster: if + 4 function calls or just a write with a slice ? # Considering data can be larger than 127 bytes now, it should be worth it if dc.ts < len(dc.data): write(dc.data[:dc.ts]) else: write(dc.data) # END handle truncation # END handle chunk mode class DeltaChunk(object): """Represents a piece of a delta, it can either add new data, or copy existing one from a source buffer""" __slots__ = ( 'to', # start offset in the target buffer in bytes 'ts', # size of this chunk in the target buffer in bytes 'so', # start offset in the source buffer in bytes or None 'data', # chunk of bytes to be added to the target buffer, # DeltaChunkList to use as base, or None ) def __init__(self, to, ts, so, data): self.to = to self.ts = ts self.so = so self.data = data def __repr__(self): return "DeltaChunk(%i, %i, %s, %s)" % (self.to, self.ts, self.so, self.data or "") #{ Interface def rbound(self): return self.to + self.ts def has_data(self): """:return: True if the instance has data to add to the target stream""" return self.data is not None #} END interface def _closest_index(dcl, absofs): """:return: index at which the given absofs should be inserted. The index points to the DeltaChunk with a target buffer absofs that equals or is greater than absofs. **Note:** global method for performance only, it belongs to DeltaChunkList""" lo = 0 hi = len(dcl) while lo < hi: mid = (lo + hi) / 2 dc = dcl[mid] if dc.to > absofs: hi = mid elif dc.rbound() > absofs or dc.to == absofs: return mid else: lo = mid + 1 # END handle bound # END for each delta absofs return len(dcl)-1 def delta_list_apply(dcl, bbuf, write): """Apply the chain's changes and write the final result using the passed write function. :param bbuf: base buffer containing the base of all deltas contained in this list. It will only be used if the chunk in question does not have a base chain. :param write: function taking a string of bytes to write to the output""" for dc in dcl: delta_chunk_apply(dc, bbuf, write) # END for each dc def delta_list_slice(dcl, absofs, size, ndcl): """:return: Subsection of this list at the given absolute offset, with the given size in bytes. 
:return: None""" cdi = _closest_index(dcl, absofs) # delta start index cd = dcl[cdi] slen = len(dcl) lappend = ndcl.append if cd.to != absofs: tcd = DeltaChunk(cd.to, cd.ts, cd.so, cd.data) _move_delta_lbound(tcd, absofs - cd.to) tcd.ts = min(tcd.ts, size) lappend(tcd) size -= tcd.ts cdi += 1 # END lbound overlap handling while cdi < slen and size: # are we larger than the current block cd = dcl[cdi] if cd.ts <= size: lappend(DeltaChunk(cd.to, cd.ts, cd.so, cd.data)) size -= cd.ts else: tcd = DeltaChunk(cd.to, cd.ts, cd.so, cd.data) tcd.ts = size lappend(tcd) size -= tcd.ts break # END hadle size cdi += 1 # END for each chunk class DeltaChunkList(list): """List with special functionality to deal with DeltaChunks. There are two types of lists we represent. The one was created bottom-up, working towards the latest delta, the other kind was created top-down, working from the latest delta down to the earliest ancestor. This attribute is queryable after all processing with is_reversed.""" __slots__ = tuple() def rbound(self): """:return: rightmost extend in bytes, absolute""" if len(self) == 0: return 0 return self[-1].rbound() def lbound(self): """:return: leftmost byte at which this chunklist starts""" if len(self) == 0: return 0 return self[0].to def size(self): """:return: size of bytes as measured by our delta chunks""" return self.rbound() - self.lbound() def apply(self, bbuf, write): """Only used by public clients, internally we only use the global routines for performance""" return delta_list_apply(self, bbuf, write) def compress(self): """Alter the list to reduce the amount of nodes. Currently we concatenate add-chunks :return: self""" slen = len(self) if slen < 2: return self i = 0 slen_orig = slen first_data_index = None while i < slen: dc = self[i] i += 1 if dc.data is None: if first_data_index is not None and i-2-first_data_index > 1: #if first_data_index is not None: nd = StringIO() # new data so = self[first_data_index].to # start offset in target buffer for x in xrange(first_data_index, i-1): xdc = self[x] nd.write(xdc.data[:xdc.ts]) # END collect data del(self[first_data_index:i-1]) buf = nd.getvalue() self.insert(first_data_index, DeltaChunk(so, len(buf), 0, buf)) slen = len(self) i = first_data_index + 1 # END concatenate data first_data_index = None continue # END skip non-data chunks if first_data_index is None: first_data_index = i-1 # END iterate list #if slen_orig != len(self): # print "INFO: Reduced delta list len to %f %% of former size" % ((float(len(self)) / slen_orig) * 100) return self def check_integrity(self, target_size=-1): """Verify the list has non-overlapping chunks only, and the total size matches target_size :param target_size: if not -1, the total size of the chain must be target_size :raise AssertionError: if the size doen't match""" if target_size > -1: assert self[-1].rbound() == target_size assert reduce(lambda x,y: x+y, (d.ts for d in self), 0) == target_size # END target size verification if len(self) < 2: return # check data for dc in self: assert dc.ts > 0 if dc.has_data(): assert len(dc.data) >= dc.ts # END for each dc left = islice(self, 0, len(self)-1) right = iter(self) right.next() # this is very pythonic - we might have just use index based access here, # but this could actually be faster for lft,rgt in izip(left, right): assert lft.rbound() == rgt.to assert lft.to + lft.ts == rgt.to # END for each pair class TopdownDeltaChunkList(DeltaChunkList): """Represents a list which is generated by feeding its ancestor streams one by one""" __slots__ 
= tuple() def connect_with_next_base(self, bdcl): """Connect this chain with the next level of our base delta chunklist. The goal in this game is to mark as many of our chunks rigid, hence they cannot be changed by any of the upcoming bases anymore. Once all our chunks are marked like that, we can stop all processing :param bdcl: data chunk list being one of our bases. They must be fed in consequtively and in order, towards the earliest ancestor delta :return: True if processing was done. Use it to abort processing of remaining streams if False is returned""" nfc = 0 # number of frozen chunks dci = 0 # delta chunk index slen = len(self) # len of self ccl = list() # temporary list while dci < slen: dc = self[dci] dci += 1 # all add-chunks which are already topmost don't need additional processing if dc.data is not None: nfc += 1 continue # END skip add chunks # copy chunks # integrate the portion of the base list into ourselves. Lists # dont support efficient insertion ( just one at a time ), but for now # we live with it. Internally, its all just a 32/64bit pointer, and # the portions of moved memory should be smallish. Maybe we just rebuild # ourselves in order to reduce the amount of insertions ... del(ccl[:]) delta_list_slice(bdcl, dc.so, dc.ts, ccl) # move the target bounds into place to match with our chunk ofs = dc.to - dc.so for cdc in ccl: cdc.to += ofs # END update target bounds if len(ccl) == 1: self[dci-1] = ccl[0] else: # maybe try to compute the expenses here, and pick the right algorithm # It would normally be faster than copying everything physically though # TODO: Use a deque here, and decide by the index whether to extend # or extend left ! post_dci = self[dci:] del(self[dci-1:]) # include deletion of dc self.extend(ccl) self.extend(post_dci) slen = len(self) dci += len(ccl)-1 # deleted dc, added rest # END handle chunk replacement # END for each chunk if nfc == slen: return False # END handle completeness return True #} END structures #{ Routines def is_loose_object(m): """ :return: True the file contained in memory map m appears to be a loose object. Only the first two bytes are needed""" b0, b1 = map(ord, m[:2]) word = (b0 << 8) + b1 return b0 == 0x78 and (word % 31) == 0 def loose_object_header_info(m): """ :return: tuple(type_string, uncompressed_size_in_bytes) the type string of the object as well as its uncompressed size in bytes. 
:param m: memory map from which to read the compressed object data""" decompress_size = 8192 # is used in cgit as well hdr = decompressobj().decompress(m, decompress_size) type_name, size = hdr[:hdr.find("\0")].split(" ") return type_name, int(size) def pack_object_header_info(data): """ :return: tuple(type_id, uncompressed_size_in_bytes, byte_offset) The type_id should be interpreted according to the ``type_id_to_type_map`` map The byte-offset specifies the start of the actual zlib compressed datastream :param m: random-access memory, like a string or memory map""" c = ord(data[0]) # first byte i = 1 # next char to read type_id = (c >> 4) & 7 # numeric type size = c & 15 # starting size s = 4 # starting bit-shift size while c & 0x80: c = ord(data[i]) i += 1 size += (c & 0x7f) << s s += 7 # END character loop return (type_id, size, i) def create_pack_object_header(obj_type, obj_size): """ :return: string defining the pack header comprised of the object type and its incompressed size in bytes :param obj_type: pack type_id of the object :param obj_size: uncompressed size in bytes of the following object stream""" c = 0 # 1 byte hdr = str() # output string c = (obj_type << 4) | (obj_size & 0xf) obj_size >>= 4 while obj_size: hdr += chr(c | 0x80) c = obj_size & 0x7f obj_size >>= 7 #END until size is consumed hdr += chr(c) return hdr def msb_size(data, offset=0): """ :return: tuple(read_bytes, size) read the msb size from the given random access data starting at the given byte offset""" size = 0 i = 0 l = len(data) hit_msb = False while i < l: c = ord(data[i+offset]) size |= (c & 0x7f) << i*7 i += 1 if not c & 0x80: hit_msb = True break # END check msb bit # END while in range if not hit_msb: raise AssertionError("Could not find terminating MSB byte in data stream") return i+offset, size def loose_object_header(type, size): """ :return: string representing the loose object header, which is immediately followed by the content stream of size 'size'""" return "%s %i\0" % (type, size) def write_object(type, size, read, write, chunk_size=chunk_size): """ Write the object as identified by type, size and source_stream into the target_stream :param type: type string of the object :param size: amount of bytes to write from source_stream :param read: read method of a stream providing the content data :param write: write method of the output stream :param close_target_stream: if True, the target stream will be closed when the routine exits, even if an error is thrown :return: The actual amount of bytes written to stream, which includes the header and a trailing newline""" tbw = 0 # total num bytes written # WRITE HEADER: type SP size NULL tbw += write(loose_object_header(type, size)) tbw += stream_copy(read, write, size, chunk_size) return tbw def stream_copy(read, write, size, chunk_size): """ Copy a stream up to size bytes using the provided read and write methods, in chunks of chunk_size **Note:** its much like stream_copy utility, but operates just using methods""" dbw = 0 # num data bytes written # WRITE ALL DATA UP TO SIZE while True: cs = min(chunk_size, size-dbw) # NOTE: not all write methods return the amount of written bytes, like # mmap.write. Its bad, but we just deal with it ... 
perhaps its not # even less efficient # data_len = write(read(cs)) # dbw += data_len data = read(cs) data_len = len(data) dbw += data_len write(data) if data_len < cs or dbw == size: break # END check for stream end # END duplicate data return dbw def connect_deltas(dstreams): """ Read the condensed delta chunk information from dstream and merge its information into a list of existing delta chunks :param dstreams: iterable of delta stream objects, the delta to be applied last comes first, then all its ancestors in order :return: DeltaChunkList, containing all operations to apply""" tdcl = None # topmost dcl dcl = tdcl = TopdownDeltaChunkList() for dsi, ds in enumerate(dstreams): # print "Stream", dsi db = ds.read() delta_buf_size = ds.size # read header i, base_size = msb_size(db) i, target_size = msb_size(db, i) # interpret opcodes tbw = 0 # amount of target bytes written while i < delta_buf_size: c = ord(db[i]) i += 1 if c & 0x80: cp_off, cp_size = 0, 0 if (c & 0x01): cp_off = ord(db[i]) i += 1 if (c & 0x02): cp_off |= (ord(db[i]) << 8) i += 1 if (c & 0x04): cp_off |= (ord(db[i]) << 16) i += 1 if (c & 0x08): cp_off |= (ord(db[i]) << 24) i += 1 if (c & 0x10): cp_size = ord(db[i]) i += 1 if (c & 0x20): cp_size |= (ord(db[i]) << 8) i += 1 if (c & 0x40): cp_size |= (ord(db[i]) << 16) i += 1 if not cp_size: cp_size = 0x10000 rbound = cp_off + cp_size if (rbound < cp_size or rbound > base_size): break dcl.append(DeltaChunk(tbw, cp_size, cp_off, None)) tbw += cp_size elif c: # NOTE: in C, the data chunks should probably be concatenated here. # In python, we do it as a post-process dcl.append(DeltaChunk(tbw, c, 0, db[i:i+c])) i += c tbw += c else: raise ValueError("unexpected delta opcode 0") # END handle command byte # END while processing delta data dcl.compress() # merge the lists ! 
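		# connect_with_next_base returns False once every chunk of the topmost
		# list is backed by literal add-data. At that point older ancestor deltas
		# cannot change the result anymore, so we may stop reading the remaining
		# streams entirely.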
if dsi > 0: if not tdcl.connect_with_next_base(dcl): break # END handle merge # prepare next base dcl = DeltaChunkList() # END for each delta stream return tdcl def apply_delta_data(src_buf, src_buf_size, delta_buf, delta_buf_size, write): """ Apply data from a delta buffer using a source buffer to the target file :param src_buf: random access data from which the delta was created :param src_buf_size: size of the source buffer in bytes :param delta_buf_size: size fo the delta buffer in bytes :param delta_buf: random access delta data :param write: write method taking a chunk of bytes **Note:** transcribed to python from the similar routine in patch-delta.c""" i = 0 db = delta_buf while i < delta_buf_size: c = ord(db[i]) i += 1 if c & 0x80: cp_off, cp_size = 0, 0 if (c & 0x01): cp_off = ord(db[i]) i += 1 if (c & 0x02): cp_off |= (ord(db[i]) << 8) i += 1 if (c & 0x04): cp_off |= (ord(db[i]) << 16) i += 1 if (c & 0x08): cp_off |= (ord(db[i]) << 24) i += 1 if (c & 0x10): cp_size = ord(db[i]) i += 1 if (c & 0x20): cp_size |= (ord(db[i]) << 8) i += 1 if (c & 0x40): cp_size |= (ord(db[i]) << 16) i += 1 if not cp_size: cp_size = 0x10000 rbound = cp_off + cp_size if (rbound < cp_size or rbound > src_buf_size): break write(buffer(src_buf, cp_off, cp_size)) elif c: write(db[i:i+c]) i += c else: raise ValueError("unexpected delta opcode 0") # END handle command byte # END while processing delta data # yes, lets use the exact same error message that git uses :) assert i == delta_buf_size, "delta replay has gone wild" def is_equal_canonical_sha(canonical_length, match, sha1): """ :return: True if the given lhs and rhs 20 byte binary shas The comparison will take the canonical_length of the match sha into account, hence the comparison will only use the last 4 bytes for uneven canonical representations :param match: less than 20 byte sha :param sha1: 20 byte sha""" binary_length = canonical_length/2 if match[:binary_length] != sha1[:binary_length]: return False if canonical_length - binary_length and \ (ord(match[-1]) ^ ord(sha1[len(match)-1])) & 0xf0: return False # END handle uneven canonnical length return True #} END routines try: # raise ImportError; # DEBUG from _perf import connect_deltas except ImportError: pass gitdb-0.5.4/gitdb/stream.py0000644000175100017510000005473711575506132014642 0ustar byronbyron# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under # the New BSD License: http://www.opensource.org/licenses/bsd-license.php from cStringIO import StringIO import errno import mmap import os from fun import ( msb_size, stream_copy, apply_delta_data, connect_deltas, DeltaChunkList, delta_types ) from util import ( allocate_memory, LazyMixin, make_sha, write, close, zlib ) has_perf_mod = False try: from _perf import apply_delta as c_apply_delta has_perf_mod = True except ImportError: pass __all__ = ( 'DecompressMemMapReader', 'FDCompressedSha1Writer', 'DeltaApplyReader', 'Sha1Writer', 'FlexibleSha1Writer', 'ZippedStoreShaWriter', 'FDCompressedSha1Writer', 'FDStream', 'NullStream') #{ RO Streams class DecompressMemMapReader(LazyMixin): """Reads data in chunks from a memory map and decompresses it. 
The client sees only the uncompressed data, respective file-like read calls are handling on-demand buffered decompression accordingly A constraint on the total size of bytes is activated, simulating a logical file within a possibly larger physical memory area To read efficiently, you clearly don't want to read individual bytes, instead, read a few kilobytes at least. **Note:** The chunk-size should be carefully selected as it will involve quite a bit of string copying due to the way the zlib is implemented. Its very wasteful, hence we try to find a good tradeoff between allocation time and number of times we actually allocate. An own zlib implementation would be good here to better support streamed reading - it would only need to keep the mmap and decompress it into chunks, thats all ... """ __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close', '_cbr', '_phi') max_read_size = 512*1024 # currently unused def __init__(self, m, close_on_deletion, size=None): """Initialize with mmap for stream reading :param m: must be content data - use new if you have object data and no size""" self._m = m self._zip = zlib.decompressobj() self._buf = None # buffer of decompressed bytes self._buflen = 0 # length of bytes in buffer if size is not None: self._s = size # size of uncompressed data to read in total self._br = 0 # num uncompressed bytes read self._cws = 0 # start byte of compression window self._cwe = 0 # end byte of compression window self._cbr = 0 # number of compressed bytes read self._phi = False # is True if we parsed the header info self._close = close_on_deletion # close the memmap on deletion ? def _set_cache_(self, attr): assert attr == '_s' # only happens for size, which is a marker to indicate we still # have to parse the header from the stream self._parse_header_info() def __del__(self): if self._close: self._m.close() # END handle resource freeing def _parse_header_info(self): """If this stream contains object data, parse the header info and skip the stream to a point where each read will yield object content :return: parsed type_string, size""" # read header maxb = 512 # should really be enough, cgit uses 8192 I believe self._s = maxb hdr = self.read(maxb) hdrend = hdr.find("\0") type, size = hdr[:hdrend].split(" ") size = int(size) self._s = size # adjust internal state to match actual header length that we ignore # The buffer will be depleted first on future reads self._br = 0 hdrend += 1 # count terminating \0 self._buf = StringIO(hdr[hdrend:]) self._buflen = len(hdr) - hdrend self._phi = True return type, size #{ Interface @classmethod def new(self, m, close_on_deletion=False): """Create a new DecompressMemMapReader instance for acting as a read-only stream This method parses the object header from m and returns the parsed type and size, as well as the created stream instance. :param m: memory map on which to oparate. It must be object data ( header + contents ) :param close_on_deletion: if True, the memory map will be closed once we are being deleted""" inst = DecompressMemMapReader(m, close_on_deletion, 0) type, size = inst._parse_header_info() return type, size, inst def data(self): """:return: random access compatible data we are working on""" return self._m def compressed_bytes_read(self): """ :return: number of compressed bytes read. 
This includes the bytes it took to decompress the header ( if there was one )""" # ABSTRACT: When decompressing a byte stream, it can be that the first # x bytes which were requested match the first x bytes in the loosely # compressed datastream. This is the worst-case assumption that the reader # does, it assumes that it will get at least X bytes from X compressed bytes # in call cases. # The caveat is that the object, according to our known uncompressed size, # is already complete, but there are still some bytes left in the compressed # stream that contribute to the amount of compressed bytes. # How can we know that we are truly done, and have read all bytes we need # to read ? # Without help, we cannot know, as we need to obtain the status of the # decompression. If it is not finished, we need to decompress more data # until it is finished, to yield the actual number of compressed bytes # belonging to the decompressed object # We are using a custom zlib module for this, if its not present, # we try to put in additional bytes up for decompression if feasible # and check for the unused_data. # Only scrub the stream forward if we are officially done with the # bytes we were to have. if self._br == self._s and not self._zip.unused_data: # manipulate the bytes-read to allow our own read method to coninute # but keep the window at its current position self._br = 0 if hasattr(self._zip, 'status'): while self._zip.status == zlib.Z_OK: self.read(mmap.PAGESIZE) # END scrub-loop custom zlib else: # pass in additional pages, until we have unused data while not self._zip.unused_data and self._cbr != len(self._m): self.read(mmap.PAGESIZE) # END scrub-loop default zlib # END handle stream scrubbing # reset bytes read, just to be sure self._br = self._s # END handle stream scrubbing # unused data ends up in the unconsumed tail, which was removed # from the count already return self._cbr #} END interface def seek(self, offset, whence=getattr(os, 'SEEK_SET', 0)): """Allows to reset the stream to restart reading :raise ValueError: If offset and whence are not 0""" if offset != 0 or whence != getattr(os, 'SEEK_SET', 0): raise ValueError("Can only seek to position 0") # END handle offset self._zip = zlib.decompressobj() self._br = self._cws = self._cwe = self._cbr = 0 if self._phi: self._phi = False del(self._s) # trigger header parsing on first access # END skip header def read(self, size=-1): if size < 1: size = self._s - self._br else: size = min(size, self._s - self._br) # END clamp size if size == 0: return str() # END handle depletion # deplete the buffer, then just continue using the decompress object # which has an own buffer. We just need this to transparently parse the # header from the zlib stream dat = str() if self._buf: if self._buflen >= size: # have enough data dat = self._buf.read(size) self._buflen -= size self._br += size return dat else: dat = self._buf.read() # ouch, duplicates data size -= self._buflen self._br += self._buflen self._buflen = 0 self._buf = None # END handle buffer len # END handle buffer # decompress some data # Abstract: zlib needs to operate on chunks of our memory map ( which may # be large ), as it will otherwise and always fill in the 'unconsumed_tail' # attribute which possible reads our whole map to the end, forcing # everything to be read from disk even though just a portion was requested. 
# As this would be a nogo, we workaround it by passing only chunks of data, # moving the window into the memory map along as we decompress, which keeps # the tail smaller than our chunk-size. This causes 'only' the chunk to be # copied once, and another copy of a part of it when it creates the unconsumed # tail. We have to use it to hand in the appropriate amount of bytes durin g # the next read. tail = self._zip.unconsumed_tail if tail: # move the window, make it as large as size demands. For code-clarity, # we just take the chunk from our map again instead of reusing the unconsumed # tail. The latter one would safe some memory copying, but we could end up # with not getting enough data uncompressed, so we had to sort that out as well. # Now we just assume the worst case, hence the data is uncompressed and the window # needs to be as large as the uncompressed bytes we want to read. self._cws = self._cwe - len(tail) self._cwe = self._cws + size else: cws = self._cws self._cws = self._cwe self._cwe = cws + size # END handle tail # if window is too small, make it larger so zip can decompress something if self._cwe - self._cws < 8: self._cwe = self._cws + 8 # END adjust winsize # takes a slice, but doesn't copy the data, it says ... indata = buffer(self._m, self._cws, self._cwe - self._cws) # get the actual window end to be sure we don't use it for computations self._cwe = self._cws + len(indata) dcompdat = self._zip.decompress(indata, size) # update the amount of compressed bytes read # We feed possibly overlapping chunks, which is why the unconsumed tail # has to be taken into consideration, as well as the unused data # if we hit the end of the stream self._cbr += len(indata) - len(self._zip.unconsumed_tail) self._br += len(dcompdat) if dat: dcompdat = dat + dcompdat # END prepend our cached data # it can happen, depending on the compression, that we get less bytes # than ordered as it needs the final portion of the data as well. # Recursively resolve that. # Note: dcompdat can be empty even though we still appear to have bytes # to read, if we are called by compressed_bytes_read - it manipulates # us to empty the stream if dcompdat and (len(dcompdat) - len(dat)) < size and self._br < self._s: dcompdat += self.read(size-len(dcompdat)) # END handle special case return dcompdat class DeltaApplyReader(LazyMixin): """A reader which dynamically applies pack deltas to a base object, keeping the memory demands to a minimum. The size of the final object is only obtainable once all deltas have been applied, unless it is retrieved from a pack index. 
The uncompressed Delta has the following layout (MSB being a most significant bit encoded dynamic size): * MSB Source Size - the size of the base against which the delta was created * MSB Target Size - the size of the resulting data after the delta was applied * A list of one byte commands (cmd) which are followed by a specific protocol: * cmd & 0x80 - copy delta_data[offset:offset+size] * Followed by an encoded offset into the delta data * Followed by an encoded size of the chunk to copy * cmd & 0x7f - insert * insert cmd bytes from the delta buffer into the output stream * cmd == 0 - invalid operation ( or error in delta stream ) """ __slots__ = ( "_bstream", # base stream to which to apply the deltas "_dstreams", # tuple of delta stream readers "_mm_target", # memory map of the delta-applied data "_size", # actual number of bytes in _mm_target "_br" # number of bytes read ) #{ Configuration k_max_memory_move = 250*1000*1000 #} END configuration def __init__(self, stream_list): """Initialize this instance with a list of streams, the first stream being the delta to apply on top of all following deltas, the last stream being the base object onto which to apply the deltas""" assert len(stream_list) > 1, "Need at least one delta and one base stream" self._bstream = stream_list[-1] self._dstreams = tuple(stream_list[:-1]) self._br = 0 def _set_cache_too_slow_without_c(self, attr): # the direct algorithm is fastest and most direct if there is only one # delta. Also, the extra overhead might not be worth it for items smaller # than X - definitely the case in python, every function call costs # huge amounts of time # if len(self._dstreams) * self._bstream.size < self.k_max_memory_move: if len(self._dstreams) == 1: return self._set_cache_brute_(attr) # Aggregate all deltas into one delta in reverse order. Hence we take # the last delta, and reverse-merge its ancestor delta, until we receive # the final delta data stream. # print "Handling %i delta streams, sizes: %s" % (len(self._dstreams), [ds.size for ds in self._dstreams]) dcl = connect_deltas(self._dstreams) # call len directly, as the (optional) c version doesn't implement the sequence # protocol if dcl.rbound() == 0: self._size = 0 self._mm_target = allocate_memory(0) return # END handle empty list self._size = dcl.rbound() self._mm_target = allocate_memory(self._size) bbuf = allocate_memory(self._bstream.size) stream_copy(self._bstream.read, bbuf.write, self._bstream.size, 256 * mmap.PAGESIZE) # APPLY CHUNKS write = self._mm_target.write dcl.apply(bbuf, write) self._mm_target.seek(0) def _set_cache_brute_(self, attr): """If we are here, we apply the actual deltas""" # TODO: There should be a special case if there is only one stream # Then the default-git algorithm should perform a tad faster, as the # delta is not peaked into, causing less overhead. 
buffer_info_list = list() max_target_size = 0 for dstream in self._dstreams: buf = dstream.read(512) # read the header information + X offset, src_size = msb_size(buf) offset, target_size = msb_size(buf, offset) buffer_info_list.append((buffer(buf, offset), offset, src_size, target_size)) max_target_size = max(max_target_size, target_size) # END for each delta stream # sanity check - the first delta to apply should have the same source # size as our actual base stream base_size = self._bstream.size target_size = max_target_size # if we have more than 1 delta to apply, we will swap buffers, hence we must # assure that all buffers we use are large enough to hold all the results if len(self._dstreams) > 1: base_size = target_size = max(base_size, max_target_size) # END adjust buffer sizes # Allocate private memory map big enough to hold the first base buffer # We need random access to it bbuf = allocate_memory(base_size) stream_copy(self._bstream.read, bbuf.write, base_size, 256 * mmap.PAGESIZE) # allocate memory map large enough for the largest (intermediate) target # We will use it as scratch space for all delta ops. If the final # target buffer is smaller than our allocated space, we just use parts # of it upon return. tbuf = allocate_memory(target_size) # for each delta to apply, memory map the decompressed delta and # work on the op-codes to reconstruct everything. # For the actual copying, we use a seek and write pattern of buffer # slices. final_target_size = None for (dbuf, offset, src_size, target_size), dstream in reversed(zip(buffer_info_list, self._dstreams)): # allocate a buffer to hold all delta data - fill in the data for # fast access. We do this as we know that reading individual bytes # from our stream would be slower than necessary ( although possible ) # The dbuf buffer contains commands after the first two MSB sizes, the # offset specifies the amount of bytes read to get the sizes. ddata = allocate_memory(dstream.size - offset) ddata.write(dbuf) # read the rest from the stream. The size we give is larger than necessary stream_copy(dstream.read, ddata.write, dstream.size, 256*mmap.PAGESIZE) ####################################################################### if 'c_apply_delta' in globals(): c_apply_delta(bbuf, ddata, tbuf); else: apply_delta_data(bbuf, src_size, ddata, len(ddata), tbuf.write) ####################################################################### # finally, swap out source and target buffers. The target is now the # base for the next delta to apply bbuf, tbuf = tbuf, bbuf bbuf.seek(0) tbuf.seek(0) final_target_size = target_size # END for each delta to apply # its already seeked to 0, constrain it to the actual size # NOTE: in the end of the loop, it swaps buffers, hence our target buffer # is not tbuf, but bbuf ! 
self._mm_target = bbuf self._size = final_target_size #{ Configuration if not has_perf_mod: _set_cache_ = _set_cache_brute_ else: _set_cache_ = _set_cache_too_slow_without_c #} END configuration def read(self, count=0): bl = self._size - self._br # bytes left if count < 1 or count > bl: count = bl # NOTE: we could check for certain size limits, and possibly # return buffers instead of strings to prevent byte copying data = self._mm_target.read(count) self._br += len(data) return data def seek(self, offset, whence=getattr(os, 'SEEK_SET', 0)): """Allows to reset the stream to restart reading :raise ValueError: If offset and whence are not 0""" if offset != 0 or whence != getattr(os, 'SEEK_SET', 0): raise ValueError("Can only seek to position 0") # END handle offset self._br = 0 self._mm_target.seek(0) #{ Interface @classmethod def new(cls, stream_list): """ Convert the given list of streams into a stream which resolves deltas when reading from it. :param stream_list: two or more stream objects, first stream is a Delta to the object that you want to resolve, followed by N additional delta streams. The list's last stream must be a non-delta stream. :return: Non-Delta OPackStream object whose stream can be used to obtain the decompressed resolved data :raise ValueError: if the stream list cannot be handled""" if len(stream_list) < 2: raise ValueError("Need at least two streams") # END single object special handling if stream_list[-1].type_id in delta_types: raise ValueError("Cannot resolve deltas if there is no base object stream, last one was type: %s" % stream_list[-1].type) # END check stream return cls(stream_list) #} END interface #{ OInfo like Interface @property def type(self): return self._bstream.type @property def type_id(self): return self._bstream.type_id @property def size(self): """:return: number of uncompressed bytes in the stream""" return self._size #} END oinfo like interface #} END RO streams #{ W Streams class Sha1Writer(object): """Simple stream writer which produces a sha whenever you like as it degests everything it is supposed to write""" __slots__ = "sha1" def __init__(self): self.sha1 = make_sha() #{ Stream Interface def write(self, data): """:raise IOError: If not all bytes could be written :return: lenght of incoming data""" self.sha1.update(data) return len(data) # END stream interface #{ Interface def sha(self, as_hex = False): """:return: sha so far :param as_hex: if True, sha will be hex-encoded, binary otherwise""" if as_hex: return self.sha1.hexdigest() return self.sha1.digest() #} END interface class FlexibleSha1Writer(Sha1Writer): """Writer producing a sha1 while passing on the written bytes to the given write function""" __slots__ = 'writer' def __init__(self, writer): Sha1Writer.__init__(self) self.writer = writer def write(self, data): Sha1Writer.write(self, data) self.writer(data) class ZippedStoreShaWriter(Sha1Writer): """Remembers everything someone writes to it and generates a sha""" __slots__ = ('buf', 'zip') def __init__(self): Sha1Writer.__init__(self) self.buf = StringIO() self.zip = zlib.compressobj(zlib.Z_BEST_SPEED) def __getattr__(self, attr): return getattr(self.buf, attr) def write(self, data): alen = Sha1Writer.write(self, data) self.buf.write(self.zip.compress(data)) return alen def close(self): self.buf.write(self.zip.flush()) def seek(self, offset, whence=getattr(os, 'SEEK_SET', 0)): """Seeking currently only supports to rewind written data Multiple writes are not supported""" if offset != 0 or whence != getattr(os, 'SEEK_SET', 0): raise 
ValueError("Can only seek to position 0") # END handle offset self.buf.seek(0) def getvalue(self): """:return: string value from the current stream position to the end""" return self.buf.getvalue() class FDCompressedSha1Writer(Sha1Writer): """Digests data written to it, making the sha available, then compress the data and write it to the file descriptor **Note:** operates on raw file descriptors **Note:** for this to work, you have to use the close-method of this instance""" __slots__ = ("fd", "sha1", "zip") # default exception exc = IOError("Failed to write all bytes to filedescriptor") def __init__(self, fd): super(FDCompressedSha1Writer, self).__init__() self.fd = fd self.zip = zlib.compressobj(zlib.Z_BEST_SPEED) #{ Stream Interface def write(self, data): """:raise IOError: If not all bytes could be written :return: lenght of incoming data""" self.sha1.update(data) cdata = self.zip.compress(data) bytes_written = write(self.fd, cdata) if bytes_written != len(cdata): raise self.exc return len(data) def close(self): remainder = self.zip.flush() if write(self.fd, remainder) != len(remainder): raise self.exc return close(self.fd) #} END stream interface class FDStream(object): """A simple wrapper providing the most basic functions on a file descriptor with the fileobject interface. Cannot use os.fdopen as the resulting stream takes ownership""" __slots__ = ("_fd", '_pos') def __init__(self, fd): self._fd = fd self._pos = 0 def write(self, data): self._pos += len(data) os.write(self._fd, data) def read(self, count=0): if count == 0: count = os.path.getsize(self._filepath) # END handle read everything bytes = os.read(self._fd, count) self._pos += len(bytes) return bytes def fileno(self): return self._fd def tell(self): return self._pos def close(self): close(self._fd) class NullStream(object): """A stream that does nothing but providing a stream interface. Use it like /dev/null""" __slots__ = tuple() def read(self, size=0): return '' def close(self): pass def write(self, data): return len(data) #} END W streams gitdb-0.5.4/gitdb/typ.py0000644000175100017510000000063611573623553014155 0ustar byronbyron# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under # the New BSD License: http://www.opensource.org/licenses/bsd-license.php """Module containing information about types known to the database""" #{ String types str_blob_type = "blob" str_commit_type = "commit" str_tree_type = "tree" str_tag_type = "tag" #} END string types gitdb-0.5.4/gitdb/pack.py0000644000175100017510000010121511604620300014230 0ustar byronbyron# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under # the New BSD License: http://www.opensource.org/licenses/bsd-license.php """Contains PackIndexFile and PackFile implementations""" from gitdb.exc import ( BadObject, UnsupportedOperation, ParseError ) from util import ( zlib, mman, LazyMixin, unpack_from, bin_to_hex, ) from fun import ( create_pack_object_header, pack_object_header_info, is_equal_canonical_sha, type_id_to_type_map, write_object, stream_copy, chunk_size, delta_types, OFS_DELTA, REF_DELTA, msb_size ) try: from _perf import PackIndexFile_sha_to_index except ImportError: pass # END try c module from base import ( # Amazing ! 
OInfo, OStream, OPackInfo, OPackStream, ODeltaStream, ODeltaPackInfo, ODeltaPackStream, ) from stream import ( DecompressMemMapReader, DeltaApplyReader, Sha1Writer, NullStream, FlexibleSha1Writer ) from struct import ( pack, unpack, ) from binascii import crc32 from itertools import izip import tempfile import array import os import sys __all__ = ('PackIndexFile', 'PackFile', 'PackEntity') #{ Utilities def pack_object_at(cursor, offset, as_stream): """ :return: Tuple(abs_data_offset, PackInfo|PackStream) an object of the correct type according to the type_id of the object. If as_stream is True, the object will contain a stream, allowing the data to be read decompressed. :param data: random accessable data containing all required information :parma offset: offset in to the data at which the object information is located :param as_stream: if True, a stream object will be returned that can read the data, otherwise you receive an info object only""" data = cursor.use_region(offset).buffer() type_id, uncomp_size, data_rela_offset = pack_object_header_info(data) total_rela_offset = None # set later, actual offset until data stream begins delta_info = None # OFFSET DELTA if type_id == OFS_DELTA: i = data_rela_offset c = ord(data[i]) i += 1 delta_offset = c & 0x7f while c & 0x80: c = ord(data[i]) i += 1 delta_offset += 1 delta_offset = (delta_offset << 7) + (c & 0x7f) # END character loop delta_info = delta_offset total_rela_offset = i # REF DELTA elif type_id == REF_DELTA: total_rela_offset = data_rela_offset+20 delta_info = data[data_rela_offset:total_rela_offset] # BASE OBJECT else: # assume its a base object total_rela_offset = data_rela_offset # END handle type id abs_data_offset = offset + total_rela_offset if as_stream: stream = DecompressMemMapReader(buffer(data, total_rela_offset), False, uncomp_size) if delta_info is None: return abs_data_offset, OPackStream(offset, type_id, uncomp_size, stream) else: return abs_data_offset, ODeltaPackStream(offset, type_id, uncomp_size, delta_info, stream) else: if delta_info is None: return abs_data_offset, OPackInfo(offset, type_id, uncomp_size) else: return abs_data_offset, ODeltaPackInfo(offset, type_id, uncomp_size, delta_info) # END handle info # END handle stream def write_stream_to_pack(read, write, zstream, base_crc=None): """Copy a stream as read from read function, zip it, and write the result. Count the number of written bytes and return it :param base_crc: if not None, the crc will be the base for all compressed data we consecutively write and generate a crc32 from. 
If None, no crc will be generated :return: tuple(no bytes read, no bytes written, crc32) crc might be 0 if base_crc was false""" br = 0 # bytes read bw = 0 # bytes written want_crc = base_crc is not None crc = 0 if want_crc: crc = base_crc #END initialize crc while True: chunk = read(chunk_size) br += len(chunk) compressed = zstream.compress(chunk) bw += len(compressed) write(compressed) # cannot assume return value if want_crc: crc = crc32(compressed, crc) #END handle crc if len(chunk) != chunk_size: break #END copy loop compressed = zstream.flush() bw += len(compressed) write(compressed) if want_crc: crc = crc32(compressed, crc) #END handle crc return (br, bw, crc) #} END utilities class IndexWriter(object): """Utility to cache index information, allowing to write all information later in one go to the given stream **Note:** currently only writes v2 indices""" __slots__ = '_objs' def __init__(self): self._objs = list() def append(self, binsha, crc, offset): """Append one piece of object information""" self._objs.append((binsha, crc, offset)) def write(self, pack_sha, write): """Write the index file using the given write method :param pack_sha: binary sha over the whole pack that we index :return: sha1 binary sha over all index file contents""" # sort for sha1 hash self._objs.sort(key=lambda o: o[0]) sha_writer = FlexibleSha1Writer(write) sha_write = sha_writer.write sha_write(PackIndexFile.index_v2_signature) sha_write(pack(">L", PackIndexFile.index_version_default)) # fanout tmplist = list((0,)*256) # fanout or list with 64 bit offsets for t in self._objs: tmplist[ord(t[0][0])] += 1 #END prepare fanout for i in xrange(255): v = tmplist[i] sha_write(pack('>L', v)) tmplist[i+1] += v #END write each fanout entry sha_write(pack('>L', tmplist[255])) # sha1 ordered # save calls, that is push them into c sha_write(''.join(t[0] for t in self._objs)) # crc32 for t in self._objs: sha_write(pack('>L', t[1]&0xffffffff)) #END for each crc tmplist = list() # offset 32 for t in self._objs: ofs = t[2] if ofs > 0x7fffffff: tmplist.append(ofs) ofs = 0x80000000 + len(tmplist)-1 #END hande 64 bit offsets sha_write(pack('>L', ofs&0xffffffff)) #END for each offset # offset 64 for ofs in tmplist: sha_write(pack(">Q", ofs)) #END for each offset # trailer assert(len(pack_sha) == 20) sha_write(pack_sha) sha = sha_writer.sha(as_hex=False) write(sha) return sha class PackIndexFile(LazyMixin): """A pack index provides offsets into the corresponding pack, allowing to find locations for offsets faster.""" # Dont use slots as we dynamically bind functions for each version, need a dict for this # The slots you see here are just to keep track of our instance variables # __slots__ = ('_indexpath', '_fanout_table', '_cursor', '_version', # '_sha_list_offset', '_crc_list_offset', '_pack_offset', '_pack_64_offset') # used in v2 indices _sha_list_offset = 8 + 1024 index_v2_signature = '\377tOc' index_version_default = 2 def __init__(self, indexpath): super(PackIndexFile, self).__init__() self._indexpath = indexpath def _set_cache_(self, attr): if attr == "_packfile_checksum": self._packfile_checksum = self._cursor.map()[-40:-20] elif attr == "_packfile_checksum": self._packfile_checksum = self._cursor.map()[-20:] elif attr == "_cursor": # Note: We don't lock the file when reading as we cannot be sure # that we can actually write to the location - it could be a read-only # alternate for instance self._cursor = mman.make_cursor(self._indexpath).use_region() # We will assume that the index will always fully fit into memory ! 
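# Illustrative sketch, not part of the original source: the IndexWriter defined above
# collects one (binsha, crc, offset) triple per packed object and serializes a v2 index
# in a single pass. All names below are hypothetical placeholders:
#
#   index_writer = IndexWriter()
#   for binsha, crc, offset in packed_objects:
#       index_writer.append(binsha, crc, offset)
#   index_binsha = index_writer.write(pack_binsha, index_file.write)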
if mman.window_size() > 0 and self._cursor.file_size() > mman.window_size(): raise AssertionError("The index file at %s is too large to fit into a mapped window (%i > %i). This is a limitation of the implementation" % (self._indexpath, self._cursor.file_size(), mman.window_size())) #END assert window size else: # now its time to initialize everything - if we are here, someone wants # to access the fanout table or related properties # CHECK VERSION mmap = self._cursor.map() self._version = (mmap[:4] == self.index_v2_signature and 2) or 1 if self._version == 2: version_id = unpack_from(">L", mmap, 4)[0] assert version_id == self._version, "Unsupported index version: %i" % version_id # END assert version # SETUP FUNCTIONS # setup our functions according to the actual version for fname in ('entry', 'offset', 'sha', 'crc'): setattr(self, fname, getattr(self, "_%s_v%i" % (fname, self._version))) # END for each function to initialize # INITIALIZE DATA # byte offset is 8 if version is 2, 0 otherwise self._initialize() # END handle attributes #{ Access V1 def _entry_v1(self, i): """:return: tuple(offset, binsha, 0)""" return unpack_from(">L20s", self._cursor.map(), 1024 + i*24) + (0, ) def _offset_v1(self, i): """see ``_offset_v2``""" return unpack_from(">L", self._cursor.map(), 1024 + i*24)[0] def _sha_v1(self, i): """see ``_sha_v2``""" base = 1024 + (i*24)+4 return self._cursor.map()[base:base+20] def _crc_v1(self, i): """unsupported""" return 0 #} END access V1 #{ Access V2 def _entry_v2(self, i): """:return: tuple(offset, binsha, crc)""" return (self._offset_v2(i), self._sha_v2(i), self._crc_v2(i)) def _offset_v2(self, i): """:return: 32 or 64 byte offset into pack files. 64 byte offsets will only be returned if the pack is larger than 4 GiB, or 2^32""" offset = unpack_from(">L", self._cursor.map(), self._pack_offset + i * 4)[0] # if the high-bit is set, this indicates that we have to lookup the offset # in the 64 bit region of the file. 
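# (e.g. a stored 32 bit value of 0x80000002 has the high-bit set and therefore refers to
# slot 2 of the trailing 64 bit offset table)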
The current offset ( lower 31 bits ) # are the index into it if offset & 0x80000000: offset = unpack_from(">Q", self._cursor.map(), self._pack_64_offset + (offset & ~0x80000000) * 8)[0] # END handle 64 bit offset return offset def _sha_v2(self, i): """:return: sha at the given index of this file index instance""" base = self._sha_list_offset + i * 20 return self._cursor.map()[base:base+20] def _crc_v2(self, i): """:return: 4 bytes crc for the object at index i""" return unpack_from(">L", self._cursor.map(), self._crc_list_offset + i * 4)[0] #} END access V2 #{ Initialization def _initialize(self): """initialize base data""" self._fanout_table = self._read_fanout((self._version == 2) * 8) if self._version == 2: self._crc_list_offset = self._sha_list_offset + self.size() * 20 self._pack_offset = self._crc_list_offset + self.size() * 4 self._pack_64_offset = self._pack_offset + self.size() * 4 # END setup base def _read_fanout(self, byte_offset): """Generate a fanout table from our data""" d = self._cursor.map() out = list() append = out.append for i in range(256): append(unpack_from('>L', d, byte_offset + i*4)[0]) # END for each entry return out #} END initialization #{ Properties def version(self): return self._version def size(self): """:return: amount of objects referred to by this index""" return self._fanout_table[255] def path(self): """:return: path to the packindexfile""" return self._indexpath def packfile_checksum(self): """:return: 20 byte sha representing the sha1 hash of the pack file""" return self._cursor.map()[-40:-20] def indexfile_checksum(self): """:return: 20 byte sha representing the sha1 hash of this index file""" return self._cursor.map()[-20:] def offsets(self): """:return: sequence of all offsets in the order in which they were written **Note:** return value can be random accessed, but may be immmutable""" if self._version == 2: # read stream to array, convert to tuple a = array.array('I') # 4 byte unsigned int, long are 8 byte on 64 bit it appears a.fromstring(buffer(self._cursor.map(), self._pack_offset, self._pack_64_offset - self._pack_offset)) # networkbyteorder to something array likes more if sys.byteorder == 'little': a.byteswap() return a else: return tuple(self.offset(index) for index in xrange(self.size())) # END handle version def sha_to_index(self, sha): """ :return: index usable with the ``offset`` or ``entry`` method, or None if the sha was not found in this pack index :param sha: 20 byte sha to lookup""" first_byte = ord(sha[0]) get_sha = self.sha lo = 0 # lower index, the left bound of the bisection if first_byte != 0: lo = self._fanout_table[first_byte-1] hi = self._fanout_table[first_byte] # the upper, right bound of the bisection # bisect until we have the sha while lo < hi: mid = (lo + hi) / 2 c = cmp(sha, get_sha(mid)) if c < 0: hi = mid elif not c: return mid else: lo = mid + 1 # END handle midpoint # END bisect return None def partial_sha_to_index(self, partial_bin_sha, canonical_length): """ :return: index as in `sha_to_index` or None if the sha was not found in this index file :param partial_bin_sha: an at least two bytes of a partial binary sha :param canonical_length: lenght of the original hexadecimal representation of the given partial binary sha :raise AmbiguousObjectName:""" if len(partial_bin_sha) < 2: raise ValueError("Require at least 2 bytes of partial sha") first_byte = ord(partial_bin_sha[0]) get_sha = self.sha lo = 0 # lower index, the left bound of the bisection if first_byte != 0: lo = self._fanout_table[first_byte-1] hi = 
self._fanout_table[first_byte] # the upper, right bound of the bisection # fill the partial to full 20 bytes filled_sha = partial_bin_sha + '\0'*(20 - len(partial_bin_sha)) # find lowest while lo < hi: mid = (lo + hi) / 2 c = cmp(filled_sha, get_sha(mid)) if c < 0: hi = mid elif not c: # perfect match lo = mid break else: lo = mid + 1 # END handle midpoint # END bisect if lo < self.size(): cur_sha = get_sha(lo) if is_equal_canonical_sha(canonical_length, partial_bin_sha, cur_sha): next_sha = None if lo+1 < self.size(): next_sha = get_sha(lo+1) if next_sha and next_sha == cur_sha: raise AmbiguousObjectName(partial_bin_sha) return lo # END if we have a match # END if we found something return None if 'PackIndexFile_sha_to_index' in globals(): # NOTE: Its just about 25% faster, the major bottleneck might be the attr # accesses def sha_to_index(self, sha): return PackIndexFile_sha_to_index(self, sha) # END redefine heavy-hitter with c version #} END properties class PackFile(LazyMixin): """A pack is a file written according to the Version 2 for git packs As we currently use memory maps, it could be assumed that the maximum size of packs therefor is 32 bit on 32 bit systems. On 64 bit systems, this should be fine though. **Note:** at some point, this might be implemented using streams as well, or streams are an alternate path in the case memory maps cannot be created for some reason - one clearly doesn't want to read 10GB at once in that case""" __slots__ = ('_packpath', '_cursor', '_size', '_version') pack_signature = 0x5041434b # 'PACK' pack_version_default = 2 # offset into our data at which the first object starts first_object_offset = 3*4 # header bytes footer_size = 20 # final sha def __init__(self, packpath): self._packpath = packpath def _set_cache_(self, attr): # we fill the whole cache, whichever attribute gets queried first self._cursor = mman.make_cursor(self._packpath).use_region() # read the header information type_id, self._version, self._size = unpack_from(">LLL", self._cursor.map(), 0) # TODO: figure out whether we should better keep the lock, or maybe # add a .keep file instead ? if type_id != self.pack_signature: raise ParseError("Invalid pack signature: %i" % type_id) def _iter_objects(self, start_offset, as_stream=True): """Handle the actual iteration of objects within this pack""" c = self._cursor content_size = c.file_size() - self.footer_size cur_offset = start_offset or self.first_object_offset null = NullStream() while cur_offset < content_size: data_offset, ostream = pack_object_at(c, cur_offset, True) # scrub the stream to the end - this decompresses the object, but yields # the amount of compressed bytes we need to get to the next offset stream_copy(ostream.read, null.write, ostream.size, chunk_size) cur_offset += (data_offset - ostream.pack_offset) + ostream.stream.compressed_bytes_read() # if a stream is requested, reset it beforehand # Otherwise return the Stream object directly, its derived from the # info object if as_stream: ostream.stream.seek(0) yield ostream # END until we have read everything #{ Pack Information def size(self): """:return: The amount of objects stored in this pack""" return self._size def version(self): """:return: the version of this pack""" return self._version def data(self): """ :return: read-only data of this pack. It provides random access and usually is a memory map. :note: This method is unsafe as it returns a window into a file which might be larger than than the actual window size""" # can use map as we are starting at offset 0. 
Otherwise we would have to use buffer() return self._cursor.use_region().map() def checksum(self): """:return: 20 byte sha1 hash on all object sha's contained in this file""" return self._cursor.use_region(self._cursor.file_size()-20).buffer()[:] def path(self): """:return: path to the packfile""" return self._packpath #} END pack information #{ Pack Specific def collect_streams(self, offset): """ :return: list of pack streams which are required to build the object at the given offset. The first entry of the list is the object at offset, the last one is either a full object, or a REF_Delta stream. The latter type needs its reference object to be locked up in an ODB to form a valid delta chain. If the object at offset is no delta, the size of the list is 1. :param offset: specifies the first byte of the object within this pack""" out = list() c = self._cursor while True: ostream = pack_object_at(c, offset, True)[1] out.append(ostream) if ostream.type_id == OFS_DELTA: offset = ostream.pack_offset - ostream.delta_info else: # the only thing we can lookup are OFFSET deltas. Everything # else is either an object, or a ref delta, in the latter # case someone else has to find it break # END handle type # END while chaining streams return out #} END pack specific #{ Read-Database like Interface def info(self, offset): """Retrieve information about the object at the given file-absolute offset :param offset: byte offset :return: OPackInfo instance, the actual type differs depending on the type_id attribute""" return pack_object_at(self._cursor, offset or self.first_object_offset, False)[1] def stream(self, offset): """Retrieve an object at the given file-relative offset as stream along with its information :param offset: byte offset :return: OPackStream instance, the actual type differs depending on the type_id attribute""" return pack_object_at(self._cursor, offset or self.first_object_offset, True)[1] def stream_iter(self, start_offset=0): """ :return: iterator yielding OPackStream compatible instances, allowing to access the data in the pack directly. :param start_offset: offset to the first object to iterate. If 0, iteration starts at the very first object in the pack. **Note:** Iterating a pack directly is costly as the datastream has to be decompressed to determine the bounds between the objects""" return self._iter_objects(start_offset, as_stream=True) #} END Read-Database like Interface class PackEntity(LazyMixin): """Combines the PackIndexFile and the PackFile into one, allowing the actual objects to be resolved and iterated""" __slots__ = ( '_index', # our index file '_pack', # our pack file '_offset_map' # on demand dict mapping one offset to the next consecutive one ) IndexFileCls = PackIndexFile PackFileCls = PackFile def __init__(self, pack_or_index_path): """Initialize ourselves with the path to the respective pack or index file""" basename, ext = os.path.splitext(pack_or_index_path) self._index = self.IndexFileCls("%s.idx" % basename) # PackIndexFile instance self._pack = self.PackFileCls("%s.pack" % basename) # corresponding PackFile instance def _set_cache_(self, attr): # currently this can only be _offset_map # TODO: make this a simple sorted offset array which can be bisected # to find the respective entry, from which we can take a +1 easily # This might be slower, but should also be much lighter in memory ! 
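# Usage sketch, not part of the original source - the path and 'binsha' are hypothetical,
# binsha being a 20 byte binary sha of an object contained in the pack:
#
#   entity = PackEntity("/objects/pack/pack-abc.idx")   # the .pack path works as well
#   info = entity.info(binsha)                          # OInfo with type and size
#   data = entity.stream(binsha).read()                 # decompressed, delta-resolved data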
offsets_sorted = sorted(self._index.offsets()) last_offset = len(self._pack.data()) - self._pack.footer_size assert offsets_sorted, "Cannot handle empty indices" offset_map = None if len(offsets_sorted) == 1: offset_map = { offsets_sorted[0] : last_offset } else: iter_offsets = iter(offsets_sorted) iter_offsets_plus_one = iter(offsets_sorted) iter_offsets_plus_one.next() consecutive = izip(iter_offsets, iter_offsets_plus_one) offset_map = dict(consecutive) # the last offset is not yet set offset_map[offsets_sorted[-1]] = last_offset # END handle offset amount self._offset_map = offset_map def _sha_to_index(self, sha): """:return: index for the given sha, or raise""" index = self._index.sha_to_index(sha) if index is None: raise BadObject(sha) return index def _iter_objects(self, as_stream): """Iterate over all objects in our index and yield their OInfo or OStream instences""" _sha = self._index.sha _object = self._object for index in xrange(self._index.size()): yield _object(_sha(index), as_stream, index) # END for each index def _object(self, sha, as_stream, index=-1): """:return: OInfo or OStream object providing information about the given sha :param index: if not -1, its assumed to be the sha's index in the IndexFile""" # its a little bit redundant here, but it needs to be efficient if index < 0: index = self._sha_to_index(sha) if sha is None: sha = self._index.sha(index) # END assure sha is present ( in output ) offset = self._index.offset(index) type_id, uncomp_size, data_rela_offset = pack_object_header_info(self._pack._cursor.use_region(offset).buffer()) if as_stream: if type_id not in delta_types: packstream = self._pack.stream(offset) return OStream(sha, packstream.type, packstream.size, packstream.stream) # END handle non-deltas # produce a delta stream containing all info # To prevent it from applying the deltas when querying the size, # we extract it from the delta stream ourselves streams = self.collect_streams_at_offset(offset) dstream = DeltaApplyReader.new(streams) return ODeltaStream(sha, dstream.type, None, dstream) else: if type_id not in delta_types: return OInfo(sha, type_id_to_type_map[type_id], uncomp_size) # END handle non-deltas # deltas are a little tougher - unpack the first bytes to obtain # the actual target size, as opposed to the size of the delta data streams = self.collect_streams_at_offset(offset) buf = streams[0].read(512) offset, src_size = msb_size(buf) offset, target_size = msb_size(buf, offset) # collect the streams to obtain the actual object type if streams[-1].type_id in delta_types: raise BadObject(sha, "Could not resolve delta object") return OInfo(sha, streams[-1].type, target_size) # END handle stream #{ Read-Database like Interface def info(self, sha): """Retrieve information about the object identified by the given sha :param sha: 20 byte sha1 :raise BadObject: :return: OInfo instance, with 20 byte sha""" return self._object(sha, False) def stream(self, sha): """Retrieve an object stream along with its information as identified by the given sha :param sha: 20 byte sha1 :raise BadObject: :return: OStream instance, with 20 byte sha""" return self._object(sha, True) def info_at_index(self, index): """As ``info``, but uses a PackIndexFile compatible index to refer to the object""" return self._object(None, False, index) def stream_at_index(self, index): """As ``stream``, but uses a PackIndexFile compatible index to refer to the object""" return self._object(None, True, index) #} END Read-Database like Interface #{ Interface def pack(self): 
""":return: the underlying pack file instance""" return self._pack def index(self): """:return: the underlying pack index file instance""" return self._index def is_valid_stream(self, sha, use_crc=False): """ Verify that the stream at the given sha is valid. :param use_crc: if True, the index' crc is run over the compressed stream of the object, which is much faster than checking the sha1. It is also more prone to unnoticed corruption or manipulation. :param sha: 20 byte sha1 of the object whose stream to verify whether the compressed stream of the object is valid. If it is a delta, this only verifies that the delta's data is valid, not the data of the actual undeltified object, as it depends on more than just this stream. If False, the object will be decompressed and the sha generated. It must match the given sha :return: True if the stream is valid :raise UnsupportedOperation: If the index is version 1 only :raise BadObject: sha was not found""" if use_crc: if self._index.version() < 2: raise UnsupportedOperation("Version 1 indices do not contain crc's, verify by sha instead") # END handle index version index = self._sha_to_index(sha) offset = self._index.offset(index) next_offset = self._offset_map[offset] crc_value = self._index.crc(index) # create the current crc value, on the compressed object data # Read it in chunks, without copying the data crc_update = zlib.crc32 pack_data = self._pack.data() cur_pos = offset this_crc_value = 0 while cur_pos < next_offset: rbound = min(cur_pos + chunk_size, next_offset) size = rbound - cur_pos this_crc_value = crc_update(buffer(pack_data, cur_pos, size), this_crc_value) cur_pos += size # END window size loop # crc returns signed 32 bit numbers, the AND op forces it into unsigned # mode ... wow, sneaky, from dulwich. return (this_crc_value & 0xffffffff) == crc_value else: shawriter = Sha1Writer() stream = self._object(sha, as_stream=True) # write a loose object, which is the basis for the sha write_object(stream.type, stream.size, stream.read, shawriter.write) assert shawriter.sha(as_hex=False) == sha return shawriter.sha(as_hex=False) == sha # END handle crc/sha verification return True def info_iter(self): """ :return: Iterator over all objects in this pack. The iterator yields OInfo instances""" return self._iter_objects(as_stream=False) def stream_iter(self): """ :return: iterator over all objects in this pack. The iterator yields OStream instances""" return self._iter_objects(as_stream=True) def collect_streams_at_offset(self, offset): """ As the version in the PackFile, but can resolve REF deltas within this pack For more info, see ``collect_streams`` :param offset: offset into the pack file at which the object can be found""" streams = self._pack.collect_streams(offset) # try to resolve the last one if needed. It is assumed to be either # a REF delta, or a base object, as OFFSET deltas are resolved by the pack if streams[-1].type_id == REF_DELTA: stream = streams[-1] while stream.type_id in delta_types: if stream.type_id == REF_DELTA: sindex = self._index.sha_to_index(stream.delta_info) if sindex is None: break stream = self._pack.stream(self._index.offset(sindex)) streams.append(stream) else: # must be another OFS DELTA - this could happen if a REF # delta we resolve previously points to an OFS delta. Who # would do that ;) ? 
We can handle it though stream = self._pack.stream(stream.delta_info) streams.append(stream) # END handle ref delta # END resolve ref streams # END resolve streams return streams def collect_streams(self, sha): """ As ``PackFile.collect_streams``, but takes a sha instead of an offset. Additionally, ref_delta streams will be resolved within this pack. If this is not possible, the stream will be left alone, hence it is adivsed to check for unresolved ref-deltas and resolve them before attempting to construct a delta stream. :param sha: 20 byte sha1 specifying the object whose related streams you want to collect :return: list of streams, first being the actual object delta, the last being a possibly unresolved base object. :raise BadObject:""" return self.collect_streams_at_offset(self._index.offset(self._sha_to_index(sha))) @classmethod def write_pack(cls, object_iter, pack_write, index_write=None, object_count = None, zlib_compression = zlib.Z_BEST_SPEED): """ Create a new pack by putting all objects obtained by the object_iterator into a pack which is written using the pack_write method. The respective index is produced as well if index_write is not Non. :param object_iter: iterator yielding odb output objects :param pack_write: function to receive strings to write into the pack stream :param indx_write: if not None, the function writes the index file corresponding to the pack. :param object_count: if you can provide the amount of objects in your iteration, this would be the place to put it. Otherwise we have to pre-iterate and store all items into a list to get the number, which uses more memory than necessary. :param zlib_compression: the zlib compression level to use :return: tuple(pack_sha, index_binsha) binary sha over all the contents of the pack and over all contents of the index. If index_write was None, index_binsha will be None **Note:** The destination of the write functions is up to the user. 
It could be a socket, or a file for instance **Note:** writes only undeltified objects""" objs = object_iter if not object_count: if not isinstance(object_iter, (tuple, list)): objs = list(object_iter) #END handle list type object_count = len(objs) #END handle object pack_writer = FlexibleSha1Writer(pack_write) pwrite = pack_writer.write ofs = 0 # current offset into the pack file index = None wants_index = index_write is not None # write header pwrite(pack('>LLL', PackFile.pack_signature, PackFile.pack_version_default, object_count)) ofs += 12 if wants_index: index = IndexWriter() #END handle index header actual_count = 0 for obj in objs: actual_count += 1 crc = 0 # object header hdr = create_pack_object_header(obj.type_id, obj.size) if index_write: crc = crc32(hdr) else: crc = None #END handle crc pwrite(hdr) # data stream zstream = zlib.compressobj(zlib_compression) ostream = obj.stream br, bw, crc = write_stream_to_pack(ostream.read, pwrite, zstream, base_crc = crc) assert(br == obj.size) if wants_index: index.append(obj.binsha, crc, ofs) #END handle index ofs += len(hdr) + bw if actual_count == object_count: break #END abort once we are done #END for each object if actual_count != object_count: raise ValueError("Expected to write %i objects into pack, but received only %i from iterators" % (object_count, actual_count)) #END count assertion # write footer pack_sha = pack_writer.sha(as_hex = False) assert len(pack_sha) == 20 pack_write(pack_sha) ofs += len(pack_sha) # just for completeness ;) index_sha = None if wants_index: index_sha = index.write(pack_sha, index_write) #END handle index return pack_sha, index_sha @classmethod def create(cls, object_iter, base_dir, object_count = None, zlib_compression = zlib.Z_BEST_SPEED): """Create a new on-disk entity comprised of a properly named pack file and a properly named and corresponding index file. The pack contains all OStream objects contained in object iter. 
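Illustrative call (a sketch only; the iterator, path and count are hypothetical):
``entity = PackEntity.create(ostream_iter, "/tmp/packs", object_count=num_objects)``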
:param base_dir: directory which is to contain the files :return: PackEntity instance initialized with the new pack **Note:** for more information on the other parameters see the write_pack method""" pack_fd, pack_path = tempfile.mkstemp('', 'pack', base_dir) index_fd, index_path = tempfile.mkstemp('', 'index', base_dir) pack_write = lambda d: os.write(pack_fd, d) index_write = lambda d: os.write(index_fd, d) pack_binsha, index_binsha = cls.write_pack(object_iter, pack_write, index_write, object_count, zlib_compression) os.close(pack_fd) os.close(index_fd) fmt = "pack-%s.%s" new_pack_path = os.path.join(base_dir, fmt % (bin_to_hex(pack_binsha), 'pack')) new_index_path = os.path.join(base_dir, fmt % (bin_to_hex(pack_binsha), 'idx')) os.rename(pack_path, new_pack_path) os.rename(index_path, new_index_path) return cls(new_pack_path) #} END interface gitdb-0.5.4/gitdb/_delta_apply.c0000644000175100017510000006554111573623553015576 0ustar byronbyron#include <_delta_apply.h> #include #include #include #include #include typedef unsigned long long ull; typedef unsigned int uint; typedef unsigned char uchar; typedef unsigned short ushort; typedef uchar bool; // Constants const ull gDIV_grow_by = 100; // DELTA STREAM ACCESS /////////////////////// inline ull msb_size(const uchar** datap, const uchar* top) { const uchar *data = *datap; ull cmd, size = 0; uint i = 0; do { cmd = *data++; size |= (cmd & 0x7f) << i; i += 7; } while (cmd & 0x80 && data < top); *datap = data; return size; } // TOP LEVEL STREAM INFO ///////////////////////////// typedef struct { const uchar *tds; // Toplevel delta stream const uchar *cstart; // start of the chunks Py_ssize_t tdslen; // size of tds in bytes Py_ssize_t target_size; // size of the target buffer which can hold all data uint num_chunks; // amount of chunks in the delta stream PyObject *parent_object; } ToplevelStreamInfo; void TSI_init(ToplevelStreamInfo* info) { info->tds = NULL; info->cstart = NULL; info->tdslen = 0; info->num_chunks = 0; info->target_size = 0; info->parent_object = 0; } void TSI_destroy(ToplevelStreamInfo* info) { #ifdef DEBUG fprintf(stderr, "TSI_destroy: %p\n", info); #endif if (info->parent_object){ Py_DECREF(info->parent_object); info->parent_object = NULL; } else if (info->tds){ PyMem_Free((void*)info->tds); } info->tds = NULL; info->cstart = NULL; info->tdslen = 0; info->num_chunks = 0; } inline const uchar* TSI_end(ToplevelStreamInfo* info) { return info->tds + info->tdslen; } inline const uchar* TSI_first(ToplevelStreamInfo* info) { return info->cstart; } // set the stream, and initialize it // initialize our set stream to point to the first chunk // Fill in the header information, which is the base and target size inline void TSI_set_stream(ToplevelStreamInfo* info, const uchar* stream) { info->tds = stream; info->cstart = stream; assert(info->tds && info->tdslen); // init stream const uchar* tdsend = TSI_end(info); msb_size(&info->cstart, tdsend); // base size info->target_size = msb_size(&info->cstart, tdsend); } // duplicate the data currently owned by the parent object drop its refcount // return 1 on success bool TSI_copy_stream_from_object(ToplevelStreamInfo* info) { assert(info->parent_object); uchar* ptmp = PyMem_Malloc(info->tdslen); if (!ptmp){ return 0; } uint ofs = (uint)(info->cstart - info->tds); memcpy((void*)ptmp, info->tds, info->tdslen); info->tds = ptmp; info->cstart = ptmp + ofs; Py_DECREF(info->parent_object); info->parent_object = 0; return 1; } // Transfer ownership of the given stream into our instance. 
The amount of chunks // remains the same, and needs to be set by the caller void TSI_replace_stream(ToplevelStreamInfo* info, const uchar* stream, uint streamlen) { assert(info->parent_object == 0); uint ofs = (uint)(info->cstart - info->tds); if (info->tds){ PyMem_Free((void*)info->tds); } info->tds = stream; info->cstart = info->tds + ofs; info->tdslen = streamlen; } // DELTA CHUNK //////////////// // Internal Delta Chunk Objects // They are just used to keep information parsed from a stream // The data pointer is always shared typedef struct { ull to; uint ts; uint so; const uchar* data; } DeltaChunk; // forward declarations const uchar* next_delta_info(const uchar*, DeltaChunk*); inline void DC_init(DeltaChunk* dc, ull to, ull ts, ull so, const uchar* data) { dc->to = to; dc->ts = ts; dc->so = so; dc->data = NULL; } inline ull DC_rbound(const DeltaChunk* dc) { return dc->to + dc->ts; } inline void DC_print(const DeltaChunk* dc, const char* prefix) { fprintf(stderr, "%s-dc: to = %i, ts = %i, so = %i, data = %p\n", prefix, (int)dc->to, dc->ts, dc->so, dc->data); } // Apply inline void DC_apply(const DeltaChunk* dc, const uchar* base, PyObject* writer, PyObject* tmpargs) { PyObject* buffer = 0; if (dc->data){ buffer = PyBuffer_FromMemory((void*)dc->data, dc->ts); } else { buffer = PyBuffer_FromMemory((void*)(base + dc->so), dc->ts); } if (PyTuple_SetItem(tmpargs, 0, buffer)){ assert(0); } // tuple steals reference, and will take care about the deallocation PyObject_Call(writer, tmpargs, NULL); } // Encode the information in the given delta chunk and write the byte-stream // into the given output stream // It will be copied into the given bounds, the given size must be the final size // and work with the given relative offset - hence the bounds are assumed to be // correct and to fit within the unaltered dc inline void DC_encode_to(const DeltaChunk* dc, uchar** pout, uint ofs, uint size) { uchar* out = *pout; if (dc->data){ *out++ = (uchar)size; memcpy(out, dc->data+ofs, size); out += size; } else { uchar i = 0x80; uchar* op = out++; uint moff = dc->so+ofs; if (moff & 0x000000ff) *out++ = moff >> 0, i |= 0x01; if (moff & 0x0000ff00) *out++ = moff >> 8, i |= 0x02; if (moff & 0x00ff0000) *out++ = moff >> 16, i |= 0x04; if (moff & 0xff000000) *out++ = moff >> 24, i |= 0x08; if (size & 0x00ff) *out++ = size >> 0, i |= 0x10; if (size & 0xff00) *out++ = size >> 8, i |= 0x20; *op = i; } *pout = out; } // Return: amount of bytes one would need to encode dc inline ushort DC_count_encode_bytes(const DeltaChunk* dc) { if (dc->data){ return 1 + dc->ts; // cmd byte + actual data bytes } else { ushort c = 1; // cmd byte uint ts = dc->ts; ull so = dc->so; // offset c += (so & 0x000000FF) > 0; c += (so & 0x0000FF00) > 0; c += (so & 0x00FF0000) > 0; c += (so & 0xFF000000) > 0; // size - max size is 0x10000, its encoded with 0 size bits c += (ts & 0x000000FF) > 0; c += (ts & 0x0000FF00) > 0; return c; } } // DELTA INFO ///////////// typedef struct { uint dso; // delta stream offset, relative to the very start of the stream uint to; // target offset (cache) } DeltaInfo; // DELTA INFO VECTOR ////////////////////// typedef struct { DeltaInfo *mem; // Memory for delta infos uint di_last_size; // size of the last element - we can't compute it using the next bound const uchar *dstream; // borrowed ointer to delta stream we index Py_ssize_t size; // Amount of DeltaInfos Py_ssize_t reserved_size; // Reserved amount of DeltaInfos } DeltaInfoVector; // Reserve enough memory to hold the given amount of delta chunks // 
Return 1 on success // NOTE: added a minimum allocation to assure reallocation is not done // just for a single additional entry. DIVs change often, and reallocs are expensive inline int DIV_reserve_memory(DeltaInfoVector* vec, uint num_dc) { if (num_dc <= vec->reserved_size){ return 1; } #ifdef DEBUG bool was_null = vec->mem == NULL; #endif if (vec->mem == NULL){ vec->mem = PyMem_Malloc(num_dc * sizeof(DeltaInfo)); } else { vec->mem = PyMem_Realloc(vec->mem, num_dc * sizeof(DeltaInfo)); } if (vec->mem == NULL){ Py_FatalError("Could not allocate memory for append operation"); } vec->reserved_size = num_dc; #ifdef DEBUG const char* format = "Allocated %i bytes at %p, to hold up to %i chunks\n"; if (!was_null) format = "Re-allocated %i bytes at %p, to hold up to %i chunks\n"; fprintf(stderr, format, (int)(vec->reserved_size * sizeof(DeltaInfo)), vec->mem, (int)vec->reserved_size); #endif return vec->mem != NULL; } /* Grow the delta chunk list by the given amount of bytes. This may trigger a realloc, but will do nothing if the reserved size is already large enough. Return 1 on success, 0 on failure */ inline int DIV_grow_by(DeltaInfoVector* vec, uint num_dc) { return DIV_reserve_memory(vec, vec->reserved_size + num_dc); } int DIV_init(DeltaInfoVector* vec, ull initial_size) { vec->mem = NULL; vec->dstream = NULL; vec->size = 0; vec->reserved_size = 0; vec->di_last_size = 0; return DIV_grow_by(vec, initial_size); } inline Py_ssize_t DIV_len(const DeltaInfoVector* vec) { return vec->size; } inline uint DIV_lbound(const DeltaInfoVector* vec) { assert(vec->size && vec->mem); return vec->mem->to; } // Return item at index inline DeltaInfo* DIV_get(const DeltaInfoVector* vec, Py_ssize_t i) { assert(i < vec->size && vec->mem); return &vec->mem[i]; } // Return last item inline DeltaInfo* DIV_last(const DeltaInfoVector* vec) { return DIV_get(vec, vec->size-1); } inline int DIV_empty(const DeltaInfoVector* vec) { return vec->size == 0; } // Return end pointer of the vector inline const DeltaInfo* DIV_end(const DeltaInfoVector* vec) { assert(!DIV_empty(vec)); return vec->mem + vec->size; } // return first item in vector inline DeltaInfo* DIV_first(const DeltaInfoVector* vec) { assert(!DIV_empty(vec)); return vec->mem; } // return rbound offset in bytes. We use information contained in the // vec to do that inline uint DIV_info_rbound(const DeltaInfoVector* vec, const DeltaInfo* di) { if (DIV_last(vec) == di){ return di->to + vec->di_last_size; } else { return (di+1)->to; } } // return size of the given delta info item inline uint DIV_info_size2(const DeltaInfoVector* vec, const DeltaInfo* di, const DeltaInfo const* veclast) { if (veclast == di){ return vec->di_last_size; } else { return (di+1)->to - di->to; } } // return size of the given delta info item inline uint DIV_info_size(const DeltaInfoVector* vec, const DeltaInfo* di) { return DIV_info_size2(vec, di, DIV_last(vec)); } void DIV_destroy(DeltaInfoVector* vec) { if (vec->mem){ #ifdef DEBUG fprintf(stderr, "DIV_destroy: %p\n", (void*)vec->mem); #endif PyMem_Free(vec->mem); vec->size = 0; vec->reserved_size = 0; vec->mem = 0; } } // Reset this vector so that its existing memory can be filled again. 
// Memory will be kept, but not cleaned up inline void DIV_forget_members(DeltaInfoVector* vec) { vec->size = 0; } // Reset the vector so that its size will be zero // It will keep its memory though, and hence can be filled again inline void DIV_reset(DeltaInfoVector* vec) { if (vec->size == 0) return; vec->size = 0; } // Append one chunk to the end of the list, and return a pointer to it // It will not have been initialized ! inline DeltaInfo* DIV_append(DeltaInfoVector* vec) { if (vec->size + 1 > vec->reserved_size){ DIV_grow_by(vec, gDIV_grow_by); } DeltaInfo* next = vec->mem + vec->size; vec->size += 1; return next; } // Return delta chunk being closest to the given absolute offset inline DeltaInfo* DIV_closest_chunk(const DeltaInfoVector* vec, ull ofs) { assert(vec->mem); ull lo = 0; ull hi = vec->size; ull mid; DeltaInfo* di; while (lo < hi) { mid = (lo + hi) / 2; di = vec->mem + mid; if (di->to > ofs){ hi = mid; } else if ((DIV_info_rbound(vec, di) > ofs) | (di->to == ofs)) { return di; } else { lo = mid + 1; } } return DIV_last(vec); } // Return the amount of chunks a slice at the given spot would have, as well as // its size in bytes it would have if the possibly partial chunks would be encoded // and added to the spot marked by sdc inline uint DIV_count_slice_bytes(const DeltaInfoVector* src, uint ofs, uint size) { uint num_bytes = 0; DeltaInfo* cdi = DIV_closest_chunk(src, ofs); DeltaChunk dc; DC_init(&dc, 0, 0, 0, NULL); // partial overlap if (cdi->to != ofs) { const ull relofs = ofs - cdi->to; const uint cdisize = DIV_info_size(src, cdi); const uint max_size = cdisize - relofs < size ? cdisize - relofs : size; size -= max_size; // get the size in bytes the info would have next_delta_info(src->dstream + cdi->dso, &dc); dc.so += relofs; dc.ts = max_size; num_bytes += DC_count_encode_bytes(&dc); cdi += 1; if (size == 0){ return num_bytes; } } const DeltaInfo const* vecend = DIV_end(src); const uchar* nstream; for( ;cdi < vecend; ++cdi){ nstream = next_delta_info(src->dstream + cdi->dso, &dc); if (dc.ts < size) { num_bytes += nstream - (src->dstream + cdi->dso); size -= dc.ts; } else { dc.ts = size; num_bytes += DC_count_encode_bytes(&dc); size = 0; break; } } assert(size == 0); return num_bytes; } // Write a slice as defined by its absolute offset in bytes and its size into the given // destination memory. The individual chunks written will be a byte copy of the source // data chunk stream // Return: number of chunks in the slice inline uint DIV_copy_slice_to(const DeltaInfoVector* src, uchar** dest, ull tofs, uint size) { assert(DIV_lbound(src) <= tofs); assert((tofs + size) <= DIV_info_rbound(src, DIV_last(src))); DeltaChunk dc; DC_init(&dc, 0, 0, 0, NULL); DeltaInfo* cdi = DIV_closest_chunk(src, tofs); uint num_chunks = 0; // partial overlap if (cdi->to != tofs) { const uint relofs = tofs - cdi->to; next_delta_info(src->dstream + cdi->dso, &dc); const uint max_size = dc.ts - relofs < size ? 
dc.ts - relofs : size; size -= max_size; // adjust dc proportions DC_encode_to(&dc, dest, relofs, max_size); num_chunks += 1; cdi += 1; if (size == 0){ return num_chunks; } } const uchar* dstream = src->dstream + cdi->dso; const uchar* nstream = dstream; for( ; nstream; dstream = nstream) { num_chunks += 1; nstream = next_delta_info(dstream, &dc); if (dc.ts < size) { memcpy(*dest, dstream, nstream - dstream); *dest += nstream - dstream; size -= dc.ts; } else { DC_encode_to(&dc, dest, 0, size); size = 0; break; } } assert(size == 0); return num_chunks; } // Take slices of div into the corresponding area of the tsi, which is the topmost // delta to apply. bool DIV_connect_with_base(ToplevelStreamInfo* tsi, DeltaInfoVector* div) { assert(tsi->num_chunks); uint num_bytes = 0; const uchar* data = TSI_first(tsi); const uchar* dend = TSI_end(tsi); DeltaChunk dc; DC_init(&dc, 0, 0, 0, NULL); // COMPUTE SIZE OF TARGET STREAM ///////////////////////////////// for (;data < dend;) { data = next_delta_info(data, &dc); // Data chunks don't need processing if (dc.data){ num_bytes += 1 + dc.ts; continue; } num_bytes += DIV_count_slice_bytes(div, dc.so, dc.ts); } assert(DC_rbound(&dc) == tsi->target_size); // GET NEW DELTA BUFFER //////////////////////// uchar *const dstream = PyMem_Malloc(num_bytes); if (!dstream){ return 0; } data = TSI_first(tsi); const uchar *ndata = data; dend = TSI_end(tsi); uint num_chunks = 0; uchar* ds = dstream; DC_init(&dc, 0, 0, 0, NULL); // pick slices from the delta and put them into the new stream for (; data < dend; data = ndata) { ndata = next_delta_info(data, &dc); // Data chunks don't need processing if (dc.data){ // just copy it over memcpy((void*)ds, (void*)data, ndata - data); ds += ndata - data; num_chunks += 1; continue; } // Copy Chunks num_chunks += DIV_copy_slice_to(div, &ds, dc.so, dc.ts); } assert(ds - dstream == num_bytes); assert(num_chunks >= tsi->num_chunks); assert(DC_rbound(&dc) == tsi->target_size); // finally, replace the streams TSI_replace_stream(tsi, dstream, num_bytes); tsi->cstart = dstream; // we have NO header ! 
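// (the rebuilt toplevel stream consists of chunk opcodes only; the base/target size header
// of the original stream is not re-encoded, so the chunk start coincides with the stream start)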
assert(tsi->tds == dstream); tsi->num_chunks = num_chunks; return 1; } // DELTA CHUNK LIST (PYTHON) ///////////////////////////// // Internally, it has nothing to do with a ChunkList anymore though typedef struct { PyObject_HEAD // ----------- ToplevelStreamInfo istream; } DeltaChunkList; int DCL_init(DeltaChunkList*self, PyObject *args, PyObject *kwds) { if(args && PySequence_Size(args) > 0){ PyErr_SetString(PyExc_ValueError, "Too many arguments"); return -1; } TSI_init(&self->istream); return 0; } void DCL_dealloc(DeltaChunkList* self) { TSI_destroy(&(self->istream)); } PyObject* DCL_py_rbound(DeltaChunkList* self) { return PyLong_FromUnsignedLongLong(self->istream.target_size); } // Write using a write function, taking remaining bytes from a base buffer PyObject* DCL_apply(DeltaChunkList* self, PyObject* args) { PyObject* pybuf = 0; PyObject* writeproc = 0; if (!PyArg_ParseTuple(args, "OO", &pybuf, &writeproc)){ PyErr_BadArgument(); return NULL; } if (!PyObject_CheckReadBuffer(pybuf)){ PyErr_SetString(PyExc_ValueError, "First argument must be a buffer-compatible object, like a string, or a memory map"); return NULL; } if (!PyCallable_Check(writeproc)){ PyErr_SetString(PyExc_ValueError, "Second argument must be a writer method with signature write(buf)"); return NULL; } const uchar* base; Py_ssize_t baselen; PyObject_AsReadBuffer(pybuf, (const void**)&base, &baselen); PyObject* tmpargs = PyTuple_New(1); const uchar* data = TSI_first(&self->istream); const uchar const* dend = TSI_end(&self->istream); DeltaChunk dc; DC_init(&dc, 0, 0, 0, NULL); while (data < dend){ data = next_delta_info(data, &dc); DC_apply(&dc, base, writeproc, tmpargs); } Py_DECREF(tmpargs); Py_RETURN_NONE; } PyMethodDef DCL_methods[] = { {"apply", (PyCFunction)DCL_apply, METH_VARARGS, "Apply the given iterable of delta streams" }, {"rbound", (PyCFunction)DCL_py_rbound, METH_NOARGS, NULL}, {NULL} /* Sentinel */ }; PyTypeObject DeltaChunkListType = { PyObject_HEAD_INIT(NULL) 0, /*ob_size*/ "DeltaChunkList", /*tp_name*/ sizeof(DeltaChunkList), /*tp_basicsize*/ 0, /*tp_itemsize*/ (destructor)DCL_dealloc, /*tp_dealloc*/ 0, /*tp_print*/ 0, /*tp_getattr*/ 0, /*tp_setattr*/ 0, /*tp_compare*/ 0, /*tp_repr*/ 0, /*tp_as_number*/ 0, /*tp_as_sequence*/ 0, /*tp_as_mapping*/ 0, /*tp_hash */ 0, /*tp_call*/ 0, /*tp_str*/ 0, /*tp_getattro*/ 0, /*tp_setattro*/ 0, /*tp_as_buffer*/ Py_TPFLAGS_DEFAULT, /*tp_flags*/ "Minimal Delta Chunk List",/* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iternext */ DCL_methods, /* tp_methods */ 0, /* tp_members */ 0, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ (initproc)DCL_init, /* tp_init */ 0, /* tp_alloc */ 0, /* tp_new */ }; // Makes a new copy of the DeltaChunkList - you have to do everything yourselve // in C ... want C++ !! DeltaChunkList* DCL_new_instance(void) { DeltaChunkList* dcl = (DeltaChunkList*) PyType_GenericNew(&DeltaChunkListType, 0, 0); assert(dcl); DCL_init(dcl, 0, 0); return dcl; } // Read the next delta chunk from the given stream and advance it // dc will contain the parsed information, its offset must be set by // the previous call of next_delta_info, which implies it should remain the // same instance between the calls. 
// Return the altered uchar pointer, reassign it to the input data inline const uchar* next_delta_info(const uchar* data, DeltaChunk* dc) { const char cmd = *data++; if (cmd & 0x80) { uint cp_off = 0, cp_size = 0; if (cmd & 0x01) cp_off = *data++; if (cmd & 0x02) cp_off |= (*data++ << 8); if (cmd & 0x04) cp_off |= (*data++ << 16); if (cmd & 0x08) cp_off |= ((unsigned) *data++ << 24); if (cmd & 0x10) cp_size = *data++; if (cmd & 0x20) cp_size |= (*data++ << 8); if (cmd & 0x40) cp_size |= (*data++ << 16); // this should never get hit with current deltas ... if (cp_size == 0) cp_size = 0x10000; dc->to += dc->ts; dc->data = NULL; dc->so = cp_off; dc->ts = cp_size; } else if (cmd) { // Just share the data dc->to += dc->ts; dc->data = data; dc->ts = cmd; dc->so = 0; data += cmd; } else { PyErr_SetString(PyExc_RuntimeError, "Encountered an unsupported delta cmd: 0"); assert(0); return NULL; } return data; } // Return amount of chunks encoded in the given delta stream // If read_header is True, then the header msb chunks will be read first. // Otherwise, the stream is assumed to be scrubbed one past the header uint compute_chunk_count(const uchar* data, const uchar* dend, bool read_header) { // read header if (read_header){ msb_size(&data, dend); msb_size(&data, dend); } DeltaChunk dc; DC_init(&dc, 0, 0, 0, NULL); uint num_chunks = 0; while (data < dend) { data = next_delta_info(data, &dc); num_chunks += 1; }// END handle command opcodes return num_chunks; } PyObject* connect_deltas(PyObject *self, PyObject *dstreams) { // obtain iterator PyObject* stream_iter = 0; if (!PyIter_Check(dstreams)){ stream_iter = PyObject_GetIter(dstreams); if (!stream_iter){ PyErr_SetString(PyExc_RuntimeError, "Couldn't obtain iterator for streams"); return NULL; } } else { stream_iter = dstreams; } DeltaInfoVector div; ToplevelStreamInfo tdsinfo; TSI_init(&tdsinfo); DIV_init(&div, 0); // GET TOPLEVEL DELTA STREAM int error = 0; PyObject* ds = 0; unsigned int dsi = 0; // delta stream index we process ds = PyIter_Next(stream_iter); if (!ds){ error = 1; goto _error; } dsi += 1; tdsinfo.parent_object = PyObject_CallMethod(ds, "read", 0); if (!PyObject_CheckReadBuffer(tdsinfo.parent_object)){ Py_DECREF(ds); error = 1; goto _error; } PyObject_AsReadBuffer(tdsinfo.parent_object, (const void**)&tdsinfo.tds, &tdsinfo.tdslen); if (tdsinfo.tdslen > pow(2, 32)){ // parent object is deallocated by info structure Py_DECREF(ds); PyErr_SetString(PyExc_RuntimeError, "Cannot handle deltas larger than 4GB"); tdsinfo.parent_object = 0; error = 1; goto _error; } Py_DECREF(ds); // let it officially know, and initialize its internal state TSI_set_stream(&tdsinfo, tdsinfo.tds); // INTEGRATE ANCESTOR DELTA STREAMS for (ds = PyIter_Next(stream_iter); ds != NULL; ds = PyIter_Next(stream_iter), ++dsi) { // Its important to initialize this before the next block which can jump // to code who needs this to exist ! 
PyObject* db = 0; // When processing the first delta, we know we will have to alter the tds // Hence we copy it and deallocate the parent object if (dsi == 1) { if (!TSI_copy_stream_from_object(&tdsinfo)){ PyErr_SetString(PyExc_RuntimeError, "Could not allocate memory to copy toplevel buffer"); // info structure takes care of the parent_object error = 1; goto loop_end; } tdsinfo.num_chunks = compute_chunk_count(tdsinfo.cstart, TSI_end(&tdsinfo), 0); } db = PyObject_CallMethod(ds, "read", 0); if (!PyObject_CheckReadBuffer(db)){ error = 1; PyErr_SetString(PyExc_RuntimeError, "Returned buffer didn't support the buffer protocol"); goto loop_end; } // Fill the stream info structure const uchar* data; Py_ssize_t dlen; PyObject_AsReadBuffer(db, (const void**)&data, &dlen); const uchar const* dstart = data; const uchar const* dend = data + dlen; div.dstream = dstart; if (dlen > pow(2, 32)){ error = 1; PyErr_SetString(PyExc_RuntimeError, "Cannot currently handle deltas larger than 4GB"); goto loop_end; } // READ HEADER msb_size(&data, dend); const ull target_size = msb_size(&data, dend); DIV_reserve_memory(&div, compute_chunk_count(data, dend, 0)); // parse command stream DeltaInfo* di = 0; // temporary pointer DeltaChunk dc; DC_init(&dc, 0, 0, 0, NULL); assert(data < dend); while (data < dend) { di = DIV_append(&div); di->dso = data - dstart; if ((data = next_delta_info(data, &dc))){ di->to = dc.to; } else { error = 1; goto loop_end; } }// END handle command opcodes // finalize information div.di_last_size = dc.ts; if (DC_rbound(&dc) != target_size){ PyErr_SetString(PyExc_RuntimeError, "Failed to parse delta stream"); error = 1; } #ifdef DEBUG fprintf(stderr, "------------ Stream %i --------\n ", (int)dsi); fprintf(stderr, "Before Connect: tdsinfo: num_chunks = %i, bytelen = %i KiB, target_size = %i KiB\n", (int)tdsinfo.num_chunks, (int)tdsinfo.tdslen/1000, (int)tdsinfo.target_size/1000); fprintf(stderr, "div->num_chunks = %i, div->reserved_size = %i, div->bytelen=%i KiB\n", (int)div.size, (int)div.reserved_size, (int)dlen/1000); #endif if (!DIV_connect_with_base(&tdsinfo, &div)){ error = 1; } #ifdef DEBUG fprintf(stderr, "after connect: tdsinfo->num_chunks = %i, tdsinfo->bytelen = %i KiB\n", (int)tdsinfo.num_chunks, (int)tdsinfo.tdslen/1000); #endif // destroy members, but keep memory DIV_reset(&div); loop_end: // perform cleanup Py_DECREF(ds); Py_DECREF(db); if (error){ break; } }// END for each stream object if (dsi == 0){ PyErr_SetString(PyExc_ValueError, "No streams provided"); } _error: if (stream_iter != dstreams){ Py_DECREF(stream_iter); } DIV_destroy(&div); // Return the actual python object - its just a container DeltaChunkList* dcl = DCL_new_instance(); if (!dcl){ PyErr_SetString(PyExc_RuntimeError, "Couldn't allocate list"); // Otherwise tdsinfo would be deallocated by the chunk list TSI_destroy(&tdsinfo); error = 1; } else { // Plain copy, transfer ownership to dcl dcl->istream = tdsinfo; } if (error){ // Will dealloc tdcv Py_XDECREF(dcl); return NULL; } return (PyObject*)dcl; } // Write using a write function, taking remaining bytes from a base buffer // replaces the corresponding method in python PyObject* apply_delta(PyObject* self, PyObject* args) { PyObject* pybbuf = 0; PyObject* pydbuf = 0; PyObject* pytbuf = 0; if (!PyArg_ParseTuple(args, "OOO", &pybbuf, &pydbuf, &pytbuf)){ PyErr_BadArgument(); return NULL; } PyObject* objects[] = { pybbuf, pydbuf, pytbuf }; assert(sizeof(objects) / sizeof(PyObject*) == 3); uint i; for(i = 0; i < 3; i++){ if 
(!PyObject_CheckReadBuffer(objects[i])){ PyErr_SetString(PyExc_ValueError, "Argument must be a buffer-compatible object, like a string, or a memory map"); return NULL; } } Py_ssize_t lbbuf; Py_ssize_t ldbuf; Py_ssize_t ltbuf; const uchar* bbuf; const uchar* dbuf; uchar* tbuf; PyObject_AsReadBuffer(pybbuf, (const void**)(&bbuf), &lbbuf); PyObject_AsReadBuffer(pydbuf, (const void**)(&dbuf), &ldbuf); if (PyObject_AsWriteBuffer(pytbuf, (void**)(&tbuf), <buf)){ PyErr_SetString(PyExc_ValueError, "Argument 3 must be a writable buffer"); return NULL; } const uchar* data = dbuf; const uchar* dend = dbuf + ldbuf; while (data < dend) { const char cmd = *data++; if (cmd & 0x80) { unsigned long cp_off = 0, cp_size = 0; if (cmd & 0x01) cp_off = *data++; if (cmd & 0x02) cp_off |= (*data++ << 8); if (cmd & 0x04) cp_off |= (*data++ << 16); if (cmd & 0x08) cp_off |= ((unsigned) *data++ << 24); if (cmd & 0x10) cp_size = *data++; if (cmd & 0x20) cp_size |= (*data++ << 8); if (cmd & 0x40) cp_size |= (*data++ << 16); if (cp_size == 0) cp_size = 0x10000; memcpy(tbuf, bbuf + cp_off, cp_size); tbuf += cp_size; } else if (cmd) { memcpy(tbuf, data, cmd); tbuf += cmd; data += cmd; } else { PyErr_SetString(PyExc_RuntimeError, "Encountered an unsupported delta cmd: 0"); return NULL; } }// END handle command opcodes Py_RETURN_NONE; } gitdb-0.5.4/gitdb/db/0000755000175100017510000000000011604623714013341 5ustar byronbyrongitdb-0.5.4/gitdb/db/base.py0000644000175100017510000002373211575506132014635 0ustar byronbyron# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under # the New BSD License: http://www.opensource.org/licenses/bsd-license.php """Contains implementations of database retrieveing objects""" from gitdb.util import ( pool, join, LazyMixin, hex_to_bin ) from gitdb.exc import ( BadObject, AmbiguousObjectName ) from async import ( ChannelThreadTask ) from itertools import chain __all__ = ('ObjectDBR', 'ObjectDBW', 'FileDBBase', 'CompoundDB', 'CachingDB') class ObjectDBR(object): """Defines an interface for object database lookup. Objects are identified either by their 20 byte bin sha""" def __contains__(self, sha): return self.has_obj #{ Query Interface def has_object(self, sha): """ :return: True if the object identified by the given 20 bytes binary sha is contained in the database""" raise NotImplementedError("To be implemented in subclass") def has_object_async(self, reader): """Return a reader yielding information about the membership of objects as identified by shas :param reader: Reader yielding 20 byte shas. 
:return: async.Reader yielding tuples of (sha, bool) pairs which indicate whether the given sha exists in the database or not""" task = ChannelThreadTask(reader, str(self.has_object_async), lambda sha: (sha, self.has_object(sha))) return pool.add_task(task) def info(self, sha): """ :return: OInfo instance :param sha: bytes binary sha :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") def info_async(self, reader): """Retrieve information of a multitude of objects asynchronously :param reader: Channel yielding the sha's of the objects of interest :return: async.Reader yielding OInfo|InvalidOInfo, in any order""" task = ChannelThreadTask(reader, str(self.info_async), self.info) return pool.add_task(task) def stream(self, sha): """:return: OStream instance :param sha: 20 bytes binary sha :raise BadObject:""" raise NotImplementedError("To be implemented in subclass") def stream_async(self, reader): """Retrieve the OStream of multiple objects :param reader: see ``info`` :param max_threads: see ``ObjectDBW.store`` :return: async.Reader yielding OStream|InvalidOStream instances in any order **Note:** depending on the system configuration, it might not be possible to read all OStreams at once. Instead, read them individually using reader.read(x) where x is small enough.""" # base implementation just uses the stream method repeatedly task = ChannelThreadTask(reader, str(self.stream_async), self.stream) return pool.add_task(task) def size(self): """:return: amount of objects in this database""" raise NotImplementedError() def sha_iter(self): """Return iterator yielding 20 byte shas for all objects in this data base""" raise NotImplementedError() #} END query interface class ObjectDBW(object): """Defines an interface to create objects in the database""" def __init__(self, *args, **kwargs): self._ostream = None #{ Edit Interface def set_ostream(self, stream): """ Adjusts the stream to which all data should be sent when storing new objects :param stream: if not None, the stream to use, if None the default stream will be used. :return: previously installed stream, or None if there was no override :raise TypeError: if the stream doesn't have the supported functionality""" cstream = self._ostream self._ostream = stream return cstream def ostream(self): """ :return: overridden output stream this instance will write to, or None if it will write to the default stream""" return self._ostream def store(self, istream): """ Create a new object in the database :return: the input istream object with its sha set to its corresponding value :param istream: IStream compatible instance. If its sha is already set to a value, the object will just be stored in the our database format, in which case the input stream is expected to be in object format ( header + contents ). :raise IOError: if data could not be written""" raise NotImplementedError("To be implemented in subclass") def store_async(self, reader): """ Create multiple new objects in the database asynchronously. The method will return right away, returning an output channel which receives the results as they are computed. :return: Channel yielding your IStream which served as input, in any order. The IStreams sha will be set to the sha it received during the process, or its error attribute will be set to the exception informing about the error. :param reader: async.Reader yielding IStream instances. The same instances will be used in the output channel as were received in by the Reader. 
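# Hedged usage sketch for the synchronous halves of these interfaces: store one blob
# through ObjectDBW.store() and query it back through the ObjectDBR methods. It uses
# LooseObjectDB and IStream from this package; the temporary directory and the sample
# payload are made up for illustration.
import tempfile
from cStringIO import StringIO
from gitdb import IStream
from gitdb.db import LooseObjectDB

data = "my data"
ldb = LooseObjectDB(tempfile.mkdtemp())
istream = ldb.store(IStream("blob", len(data), StringIO(data)))   # store() fills in istream.binsha
assert ldb.has_object(istream.binsha)
info = ldb.info(istream.binsha)              # OInfo carrying binsha, type and size
ostream = ldb.stream(istream.binsha)         # OStream additionally offers read()
assert ostream.read() == data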
**Note:** As some ODB implementations implement this operation atomic, they might abort the whole operation if one item could not be processed. Hence check how many items have actually been produced.""" # base implementation uses store to perform the work task = ChannelThreadTask(reader, str(self.store_async), self.store) return pool.add_task(task) #} END edit interface class FileDBBase(object): """Provides basic facilities to retrieve files of interest, including caching facilities to help mapping hexsha's to objects""" def __init__(self, root_path): """Initialize this instance to look for its files at the given root path All subsequent operations will be relative to this path :raise InvalidDBRoot: **Note:** The base will not perform any accessablity checking as the base might not yet be accessible, but become accessible before the first access.""" super(FileDBBase, self).__init__() self._root_path = root_path #{ Interface def root_path(self): """:return: path at which this db operates""" return self._root_path def db_path(self, rela_path): """ :return: the given relative path relative to our database root, allowing to pontentially access datafiles""" return join(self._root_path, rela_path) #} END interface class CachingDB(object): """A database which uses caches to speed-up access""" #{ Interface def update_cache(self, force=False): """ Call this method if the underlying data changed to trigger an update of the internal caching structures. :param force: if True, the update must be performed. Otherwise the implementation may decide not to perform an update if it thinks nothing has changed. :return: True if an update was performed as something change indeed""" # END interface def _databases_recursive(database, output): """Fill output list with database from db, in order. Deals with Loose, Packed and compound databases.""" if isinstance(database, CompoundDB): compounds = list() dbs = database.databases() output.extend(db for db in dbs if not isinstance(db, CompoundDB)) for cdb in (db for db in dbs if isinstance(db, CompoundDB)): _databases_recursive(cdb, output) else: output.append(database) # END handle database type class CompoundDB(ObjectDBR, LazyMixin, CachingDB): """A database which delegates calls to sub-databases. Databases are stored in the lazy-loaded _dbs attribute. 
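# Hedged sketch of how a concrete compound database is assembled: a subclass only needs
# to fill the lazily created _dbs list in _set_cache_, as described just below. The class
# name PairDB and the two MemoryDB sub-databases are made up for illustration.
from cStringIO import StringIO
from gitdb import IStream
from gitdb.db import CompoundDB, MemoryDB

class PairDB(CompoundDB):
    def _set_cache_(self, attr):
        if attr == '_dbs':
            self._dbs = [MemoryDB(), MemoryDB()]
        else:
            super(PairDB, self)._set_cache_(attr)

cdb = PairDB()
first = cdb.databases()[0]                                   # write through a sub-database ...
istream = first.store(IStream("blob", 5, StringIO("hello")))
assert cdb.has_object(istream.binsha)                        # ... query through the compound one
assert cdb.size() == 1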
Define _set_cache_ to update it with your databases""" def _set_cache_(self, attr): if attr == '_dbs': self._dbs = list() elif attr == '_db_cache': self._db_cache = dict() else: super(CompoundDB, self)._set_cache_(attr) def _db_query(self, sha): """:return: database containing the given 20 byte sha :raise BadObject:""" # most databases use binary representations, prevent converting # it everytime a database is being queried try: return self._db_cache[sha] except KeyError: pass # END first level cache for db in self._dbs: if db.has_object(sha): self._db_cache[sha] = db return db # END for each database raise BadObject(sha) #{ ObjectDBR interface def has_object(self, sha): try: self._db_query(sha) return True except BadObject: return False # END handle exceptions def info(self, sha): return self._db_query(sha).info(sha) def stream(self, sha): return self._db_query(sha).stream(sha) def size(self): """:return: total size of all contained databases""" return reduce(lambda x,y: x+y, (db.size() for db in self._dbs), 0) def sha_iter(self): return chain(*(db.sha_iter() for db in self._dbs)) #} END object DBR Interface #{ Interface def databases(self): """:return: tuple of database instances we use for lookups""" return tuple(self._dbs) def update_cache(self, force=False): # something might have changed, clear everything self._db_cache.clear() stat = False for db in self._dbs: if isinstance(db, CachingDB): stat |= db.update_cache(force) # END if is caching db # END for each database to update return stat def partial_to_complete_sha_hex(self, partial_hexsha): """ :return: 20 byte binary sha1 from the given less-than-40 byte hexsha :param partial_hexsha: hexsha with less than 40 byte :raise AmbiguousObjectName: """ databases = list() _databases_recursive(self, databases) len_partial_hexsha = len(partial_hexsha) if len_partial_hexsha % 2 != 0: partial_binsha = hex_to_bin(partial_hexsha + "0") else: partial_binsha = hex_to_bin(partial_hexsha) # END assure successful binary conversion candidate = None for db in databases: full_bin_sha = None try: if hasattr(db, 'partial_to_complete_sha_hex'): full_bin_sha = db.partial_to_complete_sha_hex(partial_hexsha) else: full_bin_sha = db.partial_to_complete_sha(partial_binsha, len_partial_hexsha) # END handle database type except BadObject: continue # END ignore bad objects if full_bin_sha: if candidate and candidate != full_bin_sha: raise AmbiguousObjectName(partial_hexsha) candidate = full_bin_sha # END handle candidate # END for each db if not candidate: raise BadObject(partial_binsha) return candidate #} END interface gitdb-0.5.4/gitdb/db/git.py0000644000175100017510000000425711573623553014514 0ustar byronbyron# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under # the New BSD License: http://www.opensource.org/licenses/bsd-license.php from base import ( CompoundDB, ObjectDBW, FileDBBase ) from loose import LooseObjectDB from pack import PackedDB from ref import ReferenceDB from gitdb.util import LazyMixin from gitdb.exc import ( InvalidDBRoot, BadObject, AmbiguousObjectName ) import os __all__ = ('GitDB', ) class GitDB(FileDBBase, ObjectDBW, CompoundDB): """A git-style object database, which contains all objects in the 'objects' subdirectory""" # Configuration PackDBCls = PackedDB LooseDBCls = LooseObjectDB ReferenceDBCls = ReferenceDB # Directories packs_dir = 'pack' loose_dir = '' alternates_dir = os.path.join('info', 'alternates') def __init__(self, root_path): """Initialize 
ourselves on a git objects directory""" super(GitDB, self).__init__(root_path) def _set_cache_(self, attr): if attr == '_dbs' or attr == '_loose_db': self._dbs = list() loose_db = None for subpath, dbcls in ((self.packs_dir, self.PackDBCls), (self.loose_dir, self.LooseDBCls), (self.alternates_dir, self.ReferenceDBCls)): path = self.db_path(subpath) if os.path.exists(path): self._dbs.append(dbcls(path)) if dbcls is self.LooseDBCls: loose_db = self._dbs[-1] # END remember loose db # END check path exists # END for each db type # should have at least one subdb if not self._dbs: raise InvalidDBRoot(self.root_path()) # END handle error # we the first one should have the store method assert loose_db is not None and hasattr(loose_db, 'store'), "First database needs store functionality" # finally set the value self._loose_db = loose_db else: super(GitDB, self)._set_cache_(attr) # END handle attrs #{ ObjectDBW interface def store(self, istream): return self._loose_db.store(istream) def ostream(self): return self._loose_db.ostream() def set_ostream(self, ostream): return self._loose_db.set_ostream(ostream) #} END objectdbw interface gitdb-0.5.4/gitdb/db/loose.py0000644000175100017510000001521311574357221015041 0ustar byronbyron# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under # the New BSD License: http://www.opensource.org/licenses/bsd-license.php from base import ( FileDBBase, ObjectDBR, ObjectDBW ) from gitdb.exc import ( InvalidDBRoot, BadObject, AmbiguousObjectName ) from gitdb.stream import ( DecompressMemMapReader, FDCompressedSha1Writer, FDStream, Sha1Writer ) from gitdb.base import ( OStream, OInfo ) from gitdb.util import ( file_contents_ro_filepath, ENOENT, hex_to_bin, bin_to_hex, exists, chmod, isdir, isfile, remove, mkdir, rename, dirname, basename, join ) from gitdb.fun import ( chunk_size, loose_object_header_info, write_object, stream_copy ) import tempfile import mmap import sys import os __all__ = ( 'LooseObjectDB', ) class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW): """A database which operates on loose object files""" # CONFIGURATION # chunks in which data will be copied between streams stream_chunk_size = chunk_size # On windows we need to keep it writable, otherwise it cannot be removed # either new_objects_mode = 0444 if os.name == 'nt': new_objects_mode = 0644 def __init__(self, root_path): super(LooseObjectDB, self).__init__(root_path) self._hexsha_to_file = dict() # Additional Flags - might be set to 0 after the first failure # Depending on the root, this might work for some mounts, for others not, which # is why it is per instance self._fd_open_flags = getattr(os, 'O_NOATIME', 0) #{ Interface def object_path(self, hexsha): """ :return: path at which the object with the given hexsha would be stored, relative to the database root""" return join(hexsha[:2], hexsha[2:]) def readable_db_object_path(self, hexsha): """ :return: readable object path to the object identified by hexsha :raise BadObject: If the object file does not exist""" try: return self._hexsha_to_file[hexsha] except KeyError: pass # END ignore cache misses # try filesystem path = self.db_path(self.object_path(hexsha)) if exists(path): self._hexsha_to_file[hexsha] = path return path # END handle cache raise BadObject(hexsha) def partial_to_complete_sha_hex(self, partial_hexsha): """:return: 20 byte binary sha1 string which matches the given name uniquely :param name: hexadecimal partial name :raise AmbiguousObjectName: 
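# Hedged usage sketch for GitDB and partial-sha resolution; the objects directory and the
# hex prefix are made up and must refer to an existing repository to actually yield a hit.
from gitdb import GitDB
from gitdb.exc import AmbiguousObjectName, BadObject
from gitdb.util import bin_to_hex

gdb = GitDB("/path/to/repo/.git/objects")
try:
    binsha = gdb.partial_to_complete_sha_hex("deadbe")   # any unique hex prefix
    print bin_to_hex(binsha)
except AmbiguousObjectName:
    print "prefix matches more than one object"
except BadObject:
    print "no object with that prefix"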
:raise BadObject: """ candidate = None for binsha in self.sha_iter(): if bin_to_hex(binsha).startswith(partial_hexsha): # it can't ever find the same object twice if candidate is not None: raise AmbiguousObjectName(partial_hexsha) candidate = binsha # END for each object if candidate is None: raise BadObject(partial_hexsha) return candidate #} END interface def _map_loose_object(self, sha): """ :return: memory map of that file to allow random read access :raise BadObject: if object could not be located""" db_path = self.db_path(self.object_path(bin_to_hex(sha))) try: return file_contents_ro_filepath(db_path, flags=self._fd_open_flags) except OSError,e: if e.errno != ENOENT: # try again without noatime try: return file_contents_ro_filepath(db_path) except OSError: raise BadObject(sha) # didn't work because of our flag, don't try it again self._fd_open_flags = 0 else: raise BadObject(sha) # END handle error # END exception handling try: return mmap.mmap(fd, 0, access=mmap.ACCESS_READ) finally: os.close(fd) # END assure file is closed def set_ostream(self, stream): """:raise TypeError: if the stream does not support the Sha1Writer interface""" if stream is not None and not isinstance(stream, Sha1Writer): raise TypeError("Output stream musst support the %s interface" % Sha1Writer.__name__) return super(LooseObjectDB, self).set_ostream(stream) def info(self, sha): m = self._map_loose_object(sha) try: type, size = loose_object_header_info(m) return OInfo(sha, type, size) finally: m.close() # END assure release of system resources def stream(self, sha): m = self._map_loose_object(sha) type, size, stream = DecompressMemMapReader.new(m, close_on_deletion = True) return OStream(sha, type, size, stream) def has_object(self, sha): try: self.readable_db_object_path(bin_to_hex(sha)) return True except BadObject: return False # END check existance def store(self, istream): """note: The sha we produce will be hex by nature""" tmp_path = None writer = self.ostream() if writer is None: # open a tmp file to write the data to fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path) if istream.binsha is None: writer = FDCompressedSha1Writer(fd) else: writer = FDStream(fd) # END handle direct stream copies # END handle custom writer try: try: if istream.binsha is not None: # copy as much as possible, the actual uncompressed item size might # be smaller than the compressed version stream_copy(istream.read, writer.write, sys.maxint, self.stream_chunk_size) else: # write object with header, we have to make a new one write_object(istream.type, istream.size, istream.read, writer.write, chunk_size=self.stream_chunk_size) # END handle direct stream copies finally: if tmp_path: writer.close() # END assure target stream is closed except: if tmp_path: os.remove(tmp_path) raise # END assure tmpfile removal on error hexsha = None if istream.binsha: hexsha = istream.hexsha else: hexsha = writer.sha(as_hex=True) # END handle sha if tmp_path: obj_path = self.db_path(self.object_path(hexsha)) obj_dir = dirname(obj_path) if not isdir(obj_dir): mkdir(obj_dir) # END handle destination directory # rename onto existing doesn't work on windows if os.name == 'nt' and isfile(obj_path): remove(obj_path) # END handle win322 rename(tmp_path, obj_path) # make sure its readable for all ! 
It started out as rw-- tmp file # but needs to be rwrr chmod(obj_path, self.new_objects_mode) # END handle dry_run istream.binsha = hex_to_bin(hexsha) return istream def sha_iter(self): # find all files which look like an object, extract sha from there for root, dirs, files in os.walk(self.root_path()): root_base = basename(root) if len(root_base) != 2: continue for f in files: if len(f) != 38: continue yield hex_to_bin(root_base + f) # END for each file # END for each walk iteration def size(self): return len(tuple(self.sha_iter())) gitdb-0.5.4/gitdb/db/ref.py0000644000175100017510000000421111573623553014473 0ustar byronbyron# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under # the New BSD License: http://www.opensource.org/licenses/bsd-license.php from base import ( CompoundDB, ) import os __all__ = ('ReferenceDB', ) class ReferenceDB(CompoundDB): """A database consisting of database referred to in a file""" # Configuration # Specifies the object database to use for the paths found in the alternates # file. If None, it defaults to the GitDB ObjectDBCls = None def __init__(self, ref_file): super(ReferenceDB, self).__init__() self._ref_file = ref_file def _set_cache_(self, attr): if attr == '_dbs': self._dbs = list() self._update_dbs_from_ref_file() else: super(ReferenceDB, self)._set_cache_(attr) # END handle attrs def _update_dbs_from_ref_file(self): dbcls = self.ObjectDBCls if dbcls is None: # late import from git import GitDB dbcls = GitDB # END get db type # try to get as many as possible, don't fail if some are unavailable ref_paths = list() try: ref_paths = [l.strip() for l in open(self._ref_file, 'r').readlines()] except (OSError, IOError): pass # END handle alternates ref_paths_set = set(ref_paths) cur_ref_paths_set = set(db.root_path() for db in self._dbs) # remove existing for path in (cur_ref_paths_set - ref_paths_set): for i, db in enumerate(self._dbs[:]): if db.root_path() == path: del(self._dbs[i]) continue # END del matching db # END for each path to remove # add new # sort them to maintain order added_paths = sorted(ref_paths_set - cur_ref_paths_set, key=lambda p: ref_paths.index(p)) for path in added_paths: try: db = dbcls(path) # force an update to verify path if isinstance(db, CompoundDB): db.databases() # END verification self._dbs.append(db) except Exception, e: # ignore invalid paths or issues pass # END for each path to add def update_cache(self, force=False): # re-read alternates and update databases self._update_dbs_from_ref_file() return super(ReferenceDB, self).update_cache(force) gitdb-0.5.4/gitdb/db/mem.py0000644000175100017510000000631111575506132014473 0ustar byronbyron# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under # the New BSD License: http://www.opensource.org/licenses/bsd-license.php """Contains the MemoryDatabase implementation""" from loose import LooseObjectDB from base import ( ObjectDBR, ObjectDBW ) from gitdb.base import ( OStream, IStream, ) from gitdb.exc import ( BadObject, UnsupportedOperation ) from gitdb.stream import ( ZippedStoreShaWriter, DecompressMemMapReader, ) from cStringIO import StringIO __all__ = ("MemoryDB", ) class MemoryDB(ObjectDBR, ObjectDBW): """A memory database stores everything to memory, providing fast IO and object retrieval. 
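# Hedged sketch of what LooseObjectDB.store() above just wrote to disk: a loose object is
# the zlib-deflated string "<type> <size>\0<payload>" stored at <sha[:2]>/<sha[2:]>. The
# path below is made up; decompressing it by hand exposes the header that
# loose_object_header_info() parses.
import zlib

raw = zlib.decompress(open("/path/to/objects/2e/65efe2a145dda7ee51d1741299f848e5bf752e", 'rb').read())
header, payload = raw.split("\0", 1)
type_name, size = header.split(" ")
assert int(size) == len(payload)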
It should be used to buffer results and obtain SHAs before writing it to the actual physical storage, as it allows to query whether object already exists in the target storage before introducing actual IO **Note:** memory is currently not threadsafe, hence the async methods cannot be used for storing""" def __init__(self): super(MemoryDB, self).__init__() self._db = LooseObjectDB("path/doesnt/matter") # maps 20 byte shas to their OStream objects self._cache = dict() def set_ostream(self, stream): raise UnsupportedOperation("MemoryDB's always stream into memory") def store(self, istream): zstream = ZippedStoreShaWriter() self._db.set_ostream(zstream) istream = self._db.store(istream) zstream.close() # close to flush zstream.seek(0) # don't provide a size, the stream is written in object format, hence the # header needs decompression decomp_stream = DecompressMemMapReader(zstream.getvalue(), close_on_deletion=False) self._cache[istream.binsha] = OStream(istream.binsha, istream.type, istream.size, decomp_stream) return istream def store_async(self, reader): raise UnsupportedOperation("MemoryDBs cannot currently be used for async write access") def has_object(self, sha): return sha in self._cache def info(self, sha): # we always return streams, which are infos as well return self.stream(sha) def stream(self, sha): try: ostream = self._cache[sha] # rewind stream for the next one to read ostream.stream.seek(0) return ostream except KeyError: raise BadObject(sha) # END exception handling def size(self): return len(self._cache) def sha_iter(self): return self._cache.iterkeys() #{ Interface def stream_copy(self, sha_iter, odb): """Copy the streams as identified by sha's yielded by sha_iter into the given odb The streams will be copied directly **Note:** the object will only be written if it did not exist in the target db :return: amount of streams actually copied into odb. 
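# Hedged sketch of the buffering workflow described here: collect objects in a MemoryDB
# first, then flush only the ones missing from a persistent database via stream_copy().
# The temporary LooseObjectDB target is made up for illustration.
import tempfile
from cStringIO import StringIO
from gitdb import IStream
from gitdb.db import LooseObjectDB, MemoryDB

mdb = MemoryDB()
istream = mdb.store(IStream("blob", 4, StringIO("data")))
ldb = LooseObjectDB(tempfile.mkdtemp())
copied = mdb.stream_copy(mdb.sha_iter(), ldb)     # number of objects actually written
assert copied == 1 and ldb.has_object(istream.binsha)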
If smaller than the amount of input shas, one or more objects did already exist in odb""" count = 0 for sha in sha_iter: if odb.has_object(sha): continue # END check object existance ostream = self.stream(sha) # compressed data including header sio = StringIO(ostream.stream.data()) istream = IStream(ostream.type, ostream.size, sio, sha) odb.store(istream) count += 1 # END for each sha return count #} END interface gitdb-0.5.4/gitdb/db/pack.py0000644000175100017510000001451311575506132014636 0ustar byronbyron# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under # the New BSD License: http://www.opensource.org/licenses/bsd-license.php """Module containing a database to deal with packs""" from base import ( FileDBBase, ObjectDBR, CachingDB ) from gitdb.util import LazyMixin from gitdb.exc import ( BadObject, UnsupportedOperation, AmbiguousObjectName ) from gitdb.pack import PackEntity import os import glob __all__ = ('PackedDB', ) #{ Utilities class PackedDB(FileDBBase, ObjectDBR, CachingDB, LazyMixin): """A database operating on a set of object packs""" # sort the priority list every N queries # Higher values are better, performance tests don't show this has # any effect, but it should have one _sort_interval = 500 def __init__(self, root_path): super(PackedDB, self).__init__(root_path) # list of lists with three items: # * hits - number of times the pack was hit with a request # * entity - Pack entity instance # * sha_to_index - PackIndexFile.sha_to_index method for direct cache query # self._entities = list() # lazy loaded list self._hit_count = 0 # amount of hits self._st_mtime = 0 # last modification data of our root path def _set_cache_(self, attr): if attr == '_entities': self._entities = list() self.update_cache(force=True) # END handle entities initialization def _sort_entities(self): self._entities.sort(key=lambda l: l[0], reverse=True) def _pack_info(self, sha): """:return: tuple(entity, index) for an item at the given sha :param sha: 20 or 40 byte sha :raise BadObject: **Note:** This method is not thread-safe, but may be hit in multi-threaded operation. The worst thing that can happen though is a counter that was not incremented, or the list being in wrong order. So we safe the time for locking here, lets see how that goes""" # presort ? 
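# Hedged usage sketch for PackedDB; the pack directory is made up and must contain
# pack-*.pack/.idx pairs as produced by git for the loop to yield anything.
from gitdb.db import PackedDB
from gitdb.util import bin_to_hex

pdb = PackedDB("/path/to/repo/.git/objects/pack")
pdb.update_cache()                       # scan the directory for pack files
for binsha in pdb.sha_iter():
    info = pdb.info(binsha)              # pack info with type and uncompressed size
    print bin_to_hex(binsha), info.type, info.size
    break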
if self._hit_count % self._sort_interval == 0: self._sort_entities() # END update sorting for item in self._entities: index = item[2](sha) if index is not None: item[0] += 1 # one hit for you self._hit_count += 1 # general hit count return (item[1], index) # END index found in pack # END for each item # no hit, see whether we have to update packs # NOTE: considering packs don't change very often, we safe this call # and leave it to the super-caller to trigger that raise BadObject(sha) #{ Object DB Read def has_object(self, sha): try: self._pack_info(sha) return True except BadObject: return False # END exception handling def info(self, sha): entity, index = self._pack_info(sha) return entity.info_at_index(index) def stream(self, sha): entity, index = self._pack_info(sha) return entity.stream_at_index(index) def sha_iter(self): sha_list = list() for entity in self.entities(): index = entity.index() sha_by_index = index.sha for index in xrange(index.size()): yield sha_by_index(index) # END for each index # END for each entity def size(self): sizes = [item[1].index().size() for item in self._entities] return reduce(lambda x,y: x+y, sizes, 0) #} END object db read #{ object db write def store(self, istream): """Storing individual objects is not feasible as a pack is designed to hold multiple objects. Writing or rewriting packs for single objects is inefficient""" raise UnsupportedOperation() def store_async(self, reader): # TODO: add ObjectDBRW before implementing this raise NotImplementedError() #} END object db write #{ Interface def update_cache(self, force=False): """ Update our cache with the acutally existing packs on disk. Add new ones, and remove deleted ones. We keep the unchanged ones :param force: If True, the cache will be updated even though the directory does not appear to have changed according to its modification timestamp. :return: True if the packs have been updated so there is new information, False if there was no change to the pack database""" stat = os.stat(self.root_path()) if not force and stat.st_mtime <= self._st_mtime: return False # END abort early on no change self._st_mtime = stat.st_mtime # packs are supposed to be prefixed with pack- by git-convention # get all pack files, figure out what changed pack_files = set(glob.glob(os.path.join(self.root_path(), "pack-*.pack"))) our_pack_files = set(item[1].pack().path() for item in self._entities) # new packs for pack_file in (pack_files - our_pack_files): # init the hit-counter/priority with the size, a good measure for hit- # probability. Its implemented so that only 12 bytes will be read entity = PackEntity(pack_file) self._entities.append([entity.pack().size(), entity, entity.index().sha_to_index]) # END for each new packfile # removed packs for pack_file in (our_pack_files - pack_files): del_index = -1 for i, item in enumerate(self._entities): if item[1].pack().path() == pack_file: del_index = i break # END found index # END for each entity assert del_index != -1 del(self._entities[del_index]) # END for each removed pack # reinitialize prioritiess self._sort_entities() return True def entities(self): """:return: list of pack entities operated upon by this database""" return [ item[1] for item in self._entities ] def partial_to_complete_sha(self, partial_binsha, canonical_length): """:return: 20 byte sha as inferred by the given partial binary sha :param partial_binsha: binary sha with less than 20 bytes :param canonical_length: length of the corresponding canonical representation. 
It is required as binary sha's cannot display whether the original hex sha had an odd or even number of characters :raise AmbiguousObjectName: :raise BadObject: """ candidate = None for item in self._entities: item_index = item[1].index().partial_sha_to_index(partial_binsha, canonical_length) if item_index is not None: sha = item[1].index().sha(item_index) if candidate and candidate != sha: raise AmbiguousObjectName(partial_binsha) candidate = sha # END handle full sha could be found # END for each entity if candidate: return candidate # still not found ? raise BadObject(partial_binsha) #} END interface gitdb-0.5.4/gitdb/db/__init__.py0000644000175100017510000000050411573623553015457 0ustar byronbyron# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under # the New BSD License: http://www.opensource.org/licenses/bsd-license.php from base import * from loose import * from mem import * from pack import * from git import * from ref import * gitdb-0.5.4/gitdb/__init__.py0000644000175100017510000000177311604623351015072 0ustar byronbyron# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under # the New BSD License: http://www.opensource.org/licenses/bsd-license.php """Initialize the object database module""" import sys import os #{ Initialization def _init_externals(): """Initialize external projects by putting them into the path""" for module in ('async', 'smmap'): sys.path.append(os.path.join(os.path.dirname(__file__), 'ext', module)) try: __import__(module) except ImportError: raise ImportError("'%s' could not be imported, assure it is located in your PYTHONPATH" % module) #END verify import #END handel imports #} END initialization _init_externals() __author__ = "Sebastian Thiel" __contact__ = "byronimo@gmail.com" __homepage__ = "https://github.com/gitpython-developers/gitdb" version_info = (0, 5, 4) __version__ = '.'.join(str(i) for i in version_info) # default imports from db import * from base import * from stream import * gitdb-0.5.4/gitdb/util.py0000644000175100017510000002530111604621525014302 0ustar byronbyron# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors # # This module is part of GitDB and is released under # the New BSD License: http://www.opensource.org/licenses/bsd-license.php import binascii import os import mmap import sys import errno from cStringIO import StringIO # in py 2.4, StringIO is only StringI, without write support. # Hence we must use the python implementation for this if sys.version_info[1] < 5: from StringIO import StringIO # END handle python 2.4 try: import async.mod.zlib as zlib except ImportError: import zlib # END try async zlib from async import ThreadPool from smmap import ( StaticWindowMapManager, SlidingWindowMapManager, SlidingWindowMapBuffer ) # initialize our global memory manager instance # Use it to free cached (and unused) resources. 
if sys.version_info[1] < 6: mman = StaticWindowMapManager() else: mman = SlidingWindowMapManager() #END handle mman try: import hashlib except ImportError: import sha try: from struct import unpack_from except ImportError: from struct import unpack, calcsize __calcsize_cache = dict() def unpack_from(fmt, data, offset=0): try: size = __calcsize_cache[fmt] except KeyError: size = calcsize(fmt) __calcsize_cache[fmt] = size # END exception handling return unpack(fmt, data[offset : offset + size]) # END own unpack_from implementation #{ Globals # A pool distributing tasks, initially with zero threads, hence everything # will be handled in the main thread pool = ThreadPool(0) #} END globals #{ Aliases hex_to_bin = binascii.a2b_hex bin_to_hex = binascii.b2a_hex # errors ENOENT = errno.ENOENT # os shortcuts exists = os.path.exists mkdir = os.mkdir chmod = os.chmod isdir = os.path.isdir isfile = os.path.isfile rename = os.rename remove = os.remove dirname = os.path.dirname basename = os.path.basename join = os.path.join read = os.read write = os.write close = os.close fsync = os.fsync # constants NULL_HEX_SHA = "0"*40 NULL_BIN_SHA = "\0"*20 #} END Aliases #{ compatibility stuff ... class _RandomAccessStringIO(object): """Wrapper to provide required functionality in case memory maps cannot or may not be used. This is only really required in python 2.4""" __slots__ = '_sio' def __init__(self, buf=''): self._sio = StringIO(buf) def __getattr__(self, attr): return getattr(self._sio, attr) def __len__(self): return len(self.getvalue()) def __getitem__(self, i): return self.getvalue()[i] def __getslice__(self, start, end): return self.getvalue()[start:end] #} END compatibility stuff ... #{ Routines def make_sha(source=''): """A python2.4 workaround for the sha/hashlib module fiasco **Note** From the dulwich project """ try: return hashlib.sha1(source) except NameError: sha1 = sha.sha(source) return sha1 def allocate_memory(size): """:return: a file-protocol accessible memory block of the given size""" if size == 0: return _RandomAccessStringIO('') # END handle empty chunks gracefully try: return mmap.mmap(-1, size) # read-write by default except EnvironmentError: # setup real memory instead # this of course may fail if the amount of memory is not available in # one chunk - would only be the case in python 2.4, being more likely on # 32 bit systems. return _RandomAccessStringIO("\0"*size) # END handle memory allocation def file_contents_ro(fd, stream=False, allow_mmap=True): """:return: read-only contents of the file represented by the file descriptor fd :param fd: file descriptor opened for reading :param stream: if False, random access is provided, otherwise the stream interface is provided. :param allow_mmap: if True, its allowed to map the contents into memory, which allows large files to be handled and accessed efficiently. 
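# Hedged aside tying the helpers defined in this module together: a git sha1 is the hash
# of the canonical header "<type> <size>\0" followed by the payload, so make_sha() plus
# bin_to_hex() is all a writer needs to name an object. The sample payload is made up.
payload = "hello"
sha1 = make_sha("blob %i\0" % len(payload))
sha1.update(payload)
assert len(sha1.digest()) == 20          # 20 byte binary sha
hexsha = bin_to_hex(sha1.digest())       # 40 byte hex representation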
The file-descriptor will change its position if this is False""" try: if allow_mmap: # supports stream and random access try: return mmap.mmap(fd, 0, access=mmap.ACCESS_READ) except EnvironmentError: # python 2.4 issue, 0 wants to be the actual size return mmap.mmap(fd, os.fstat(fd).st_size, access=mmap.ACCESS_READ) # END handle python 2.4 except OSError: pass # END exception handling # read manully contents = os.read(fd, os.fstat(fd).st_size) if stream: return _RandomAccessStringIO(contents) return contents def file_contents_ro_filepath(filepath, stream=False, allow_mmap=True, flags=0): """Get the file contents at filepath as fast as possible :return: random access compatible memory of the given filepath :param stream: see ``file_contents_ro`` :param allow_mmap: see ``file_contents_ro`` :param flags: additional flags to pass to os.open :raise OSError: If the file could not be opened **Note** for now we don't try to use O_NOATIME directly as the right value needs to be shared per database in fact. It only makes a real difference for loose object databases anyway, and they use it with the help of the ``flags`` parameter""" fd = os.open(filepath, os.O_RDONLY|getattr(os, 'O_BINARY', 0)|flags) try: return file_contents_ro(fd, stream, allow_mmap) finally: close(fd) # END assure file is closed def sliding_ro_buffer(filepath, flags=0): """ :return: a buffer compatible object which uses our mapped memory manager internally ready to read the whole given filepath""" return SlidingWindowMapBuffer(mman.make_cursor(filepath), flags=flags) def to_hex_sha(sha): """:return: hexified version of sha""" if len(sha) == 40: return sha return bin_to_hex(sha) def to_bin_sha(sha): if len(sha) == 20: return sha return hex_to_bin(sha) #} END routines #{ Utilities class LazyMixin(object): """ Base class providing an interface to lazily retrieve attribute values upon first access. If slots are used, memory will only be reserved once the attribute is actually accessed and retrieved the first time. All future accesses will return the cached value as stored in the Instance's dict or slot. """ __slots__ = tuple() def __getattr__(self, attr): """ Whenever an attribute is requested that we do not know, we allow it to be created and set. Next time the same attribute is reqeusted, it is simply returned from our dict/slots. """ self._set_cache_(attr) # will raise in case the cache was not created return object.__getattribute__(self, attr) def _set_cache_(self, attr): """ This method should be overridden in the derived class. It should check whether the attribute named by attr can be created and cached. Do nothing if you do not know the attribute or call your subclass The derived class may create as many additional attributes as it deems necessary in case a git command returns more information than represented in the single attribute.""" pass class LockedFD(object): """ This class facilitates a safe read and write operation to a file on disk. If we write to 'file', we obtain a lock file at 'file.lock' and write to that instead. If we succeed, the lock file will be renamed to overwrite the original file. When reading, we obtain a lock file, but to prevent other writers from succeeding while we are reading the file. This type handles error correctly in that it will assure a consistent state on destruction. 
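# Hedged sketch of the LazyMixin contract defined above: the attribute is computed on
# first access only; the class name Config and the attribute name data are made up.
class Config(LazyMixin):
    def _set_cache_(self, attr):
        if attr == 'data':
            self.data = {'expensive': 'value'}   # computed exactly once
        else:
            super(Config, self)._set_cache_(attr)

cfg = Config()
assert cfg.data['expensive'] == 'value'          # first access triggers _set_cache_('data')
assert 'data' in cfg.__dict__                    # later accesses hit the cached value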
**note** with this setup, parallel reading is not possible""" __slots__ = ("_filepath", '_fd', '_write') def __init__(self, filepath): """Initialize an instance with the givne filepath""" self._filepath = filepath self._fd = None self._write = None # if True, we write a file def __del__(self): # will do nothing if the file descriptor is already closed if self._fd is not None: self.rollback() def _lockfilepath(self): return "%s.lock" % self._filepath def open(self, write=False, stream=False): """ Open the file descriptor for reading or writing, both in binary mode. :param write: if True, the file descriptor will be opened for writing. Other wise it will be opened read-only. :param stream: if True, the file descriptor will be wrapped into a simple stream object which supports only reading or writing :return: fd to read from or write to. It is still maintained by this instance and must not be closed directly :raise IOError: if the lock could not be retrieved :raise OSError: If the actual file could not be opened for reading **note** must only be called once""" if self._write is not None: raise AssertionError("Called %s multiple times" % self.open) self._write = write # try to open the lock file binary = getattr(os, 'O_BINARY', 0) lockmode = os.O_WRONLY | os.O_CREAT | os.O_EXCL | binary try: fd = os.open(self._lockfilepath(), lockmode, 0600) if not write: os.close(fd) else: self._fd = fd # END handle file descriptor except OSError: raise IOError("Lock at %r could not be obtained" % self._lockfilepath()) # END handle lock retrieval # open actual file if required if self._fd is None: # we could specify exlusive here, as we obtained the lock anyway try: self._fd = os.open(self._filepath, os.O_RDONLY | binary) except: # assure we release our lockfile os.remove(self._lockfilepath()) raise # END handle lockfile # END open descriptor for reading if stream: # need delayed import from stream import FDStream return FDStream(self._fd) else: return self._fd # END handle stream def commit(self): """When done writing, call this function to commit your changes into the actual file. The file descriptor will be closed, and the lockfile handled. **Note** can be called multiple times""" self._end_writing(successful=True) def rollback(self): """Abort your operation without any changes. The file descriptor will be closed, and the lock released. **Note** can be called multiple times""" self._end_writing(successful=False) def _end_writing(self, successful=True): """Handle the lock according to the write mode """ if self._write is None: raise AssertionError("Cannot end operation if it wasn't started yet") if self._fd is None: return os.close(self._fd) self._fd = None lockfile = self._lockfilepath() if self._write and successful: # on windows, rename does not silently overwrite the existing one if sys.platform == "win32": if isfile(self._filepath): os.remove(self._filepath) # END remove if exists # END win32 special handling os.rename(lockfile, self._filepath) # assure others can at least read the file - the tmpfile left it at rw-- # We may also write that file, on windows that boils down to a remove- # protection as well chmod(self._filepath, 0644) else: # just delete the file so far, we failed os.remove(lockfile) # END successful handling #} END utilities
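# Hedged usage sketch for LockedFD: write through the lock file and commit to atomically
# replace the target; the file path and payload are made up for illustration.
import os
from gitdb.util import LockedFD

lfd = LockedFD("/tmp/some_config_file")
try:
    fd = lfd.open(write=True)            # creates and holds /tmp/some_config_file.lock
    os.write(fd, "new contents")
    lfd.commit()                         # closes fd and renames the lock file over the target
except IOError:
    lfd.rollback()                       # lock could not be obtained, release everything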