debdelta/ 0000755 0000000 0000000 00000000000 12436652141 007514 5 ustar debdelta/debdelta_repo 0000755 0000000 0000000 00000024336 12436652141 012243 0 ustar #!/usr/bin/python
"""
debdelta_repo
Copyright (c) 2011 A. Mennucci
License: GNU GPL v2
"""
#TODO this skeleton does not handle 'security', where some old versions of the packages are in
# a different DISTTOKEN
import sys , os , tempfile , string ,getopt , tarfile , shutil , time, traceback, stat, pwd, grp
from stat import ST_SIZE, ST_MTIME, ST_MODE, ST_INO, ST_DEV, S_IMODE, S_IRUSR, S_IWUSR, S_IXUSR
from os.path import abspath
from copy import copy
from types import IntType, StringType, FunctionType, TupleType, ListType, DictType, BufferType
from apt import VersionCompare
# Common usage prefix shared by all of the per-command help texts below.
__help__usage__ = "Usage: debdelta_repo [OPTION]... "
# One help line per command-line option; joined together by help() when a
# specific command is requested.
__help__options__={
"verbose":"-v --verbose\n be verbose, print more informations",
"workspace":"-W WORKSPACE\n directory were all the work is done",
"debrepo":"-D DEBREPO\n directory of the repository of debs",
}
#-R
#--release RELEASE
#is the Debian Release file,
#-d --debug
# print debugging info (not really useful but for the program author)
# Per-command help texts, keyed by the command name ('create', 'add', ...);
# the None key holds the generic usage shown when no command is given.
__help__ = {
None : __help__usage__ +"""[COMMAND] [ARGS]..\n
[command] may be one of --create --add --sos --deltas \n
Use -h [command] for further help on commands""",
'create' : __help__usage__ +"""--create [ARGS]\n
Creates the sqlite database SQL DB that is used to store packages' info.""",
'add' : __help__usage__ +"""--add name version arch filename disttoken
or alternatively
--add --stdin
that reads from stdin lines with the above five arguments, tab separated
it stores in the database the fact that name,version,arch has entered disttoken,
and the package file is at filename (if nonabsolute, -D is used)""",
'sos' : __help__usage__ +"""--sos filename
saves the filename somewhere""",
'deltas' : __help__usage__ +"""
create all deltas""",
}
def help(cmd=None):
    """Write the help text for *cmd* (or the generic usage) to stderr.

    A leading '--' on *cmd* is stripped, so both 'add' and '--add' work.
    When a specific command is given, the option summaries are appended.
    """
    if cmd is not None and cmd[:2] == '--':
        cmd = cmd[2:]
    text = __help__.get(cmd, " UNKNOWN COMMAND ")
    sys.stderr.write(text + "\n")
    if cmd:
        joined = string.join(__help__options__.values(), "\n ")
        sys.stderr.write("\nOptions:\n " + joined + "\n")
try:
from pysqlite2 import dbapi2 as dbapi
except ImportError:
dbapi = None
if dbapi is not None:  # idiomatic identity test for None (was: dbapi != None)
    # ===== sqlite machinery
    def convert_blob(s):
        """Identity converter: sqlite hands us a string already, pass it through."""
        return s #this is always a string
    # Register the adapter
    #sqlite.register_adapter(StringType, adapt_blob)
    # Register the converter for both declared column types we use
    dbapi.register_converter("blob", convert_blob)
    dbapi.register_converter("text", convert_blob)
# Schema of the delta-queue database:
#  'package' holds one row per known .deb file (ownfile marks debs whose
#  file is managed -- and eventually deleted -- by this tool);
#  'dist' maps package rows into disttokens (e.g. "testing/main/amd64"),
#  with 'generated' flagging rows whose deltas have already been produced.
sql_scheme="""
create table package (
id integer unique primary key autoincrement,
name text,
version text,
arch text,
filename text,
ownfile boolean,
ctime integer
) ;
create table dist (
id integer unique primary key autoincrement,
disttoken text,
package_id integer,
generated boolean,
ctime integer
) ;
CREATE INDEX IF NOT EXISTS package_name ON package ( name );
CREATE INDEX IF NOT EXISTS package_name_arch ON package ( name,arch );
CREATE INDEX IF NOT EXISTS package_filename ON package ( filename );
CREATE INDEX IF NOT EXISTS dist_package_id ON dist ( package_id );
"""
class theSQLdb:
dbname=None
sql_connection=None
sql_cursor=None
def __init__(self,dbname):
assert type(dbname) == StringType
assert os.path.exists(dbname)
self.dbname=dbname
self.sql_connection = dbapi.connect(dbname,
detect_types=dbapi.PARSE_DECLTYPES | dbapi.PARSE_COLNAMES)
self.sql_cursor = self.sql_connection.cursor()
def __del__(self):
self.sql_connection.close()
def commit(self):
self.sql_connection.commit()
def add_one(self,name,version,arch,filename,disttoken,generated=0,ownfile=0,ctime=None):
if ctime==None: ctime=int(time.time())
self.sql_cursor.execute('SELECT name,version,arch,id FROM package WHERE filename = ? ',\
(filename,))
tp=self.sql_cursor.fetchone()
if tp:
if ( tp[0] != name or tp[1] != version or tp[2] != arch):
sys.stderr.write('Filename already in package database as: %s\n' % repr(tp))
return
tpid=tp[3]
else:
self.sql_cursor.execute('INSERT INTO package VALUES (null, ?, ?, ?, ?, ?, ?)',\
(name,version,arch,filename,ownfile,ctime))
tpid=self.sql_cursor.lastrowid
z=self.sql_cursor.fetchone()
if z:
sys.stderr.write('Warning two entries with same filename?\n')
self.sql_cursor.execute('SELECT id FROM dist WHERE package_id = ? AND disttoken = ? ', (tpid,disttoken))
td=self.sql_cursor.fetchone()
if td:
sys.stderr.write('Package,version,arch already in dist database for this disttoken\n')
#FIXME we may have added a package and no dist?
return
self.sql_cursor.execute('INSERT INTO dist VALUES (null, ?, ?, ?, ?)',\
(disttoken,tpid,generated,ctime))
def package_versions(self,name,disttoken,arch=None,generated=None):
"returns a list of id,name,arch,version"
sql_cursor1 = self.sql_connection.cursor()
sql_cursor2 = self.sql_connection.cursor()
if generated==None:
sql_cursor1.execute('SELECT package_id FROM dist WHERE disttoken = ? ',(disttoken,))
elif generated:
sql_cursor1.execute('SELECT package_id FROM dist WHERE disttoken = ? AND generate = 1',(disttoken,))
else:
sql_cursor1.execute('SELECT package_id FROM dist WHERE disttoken = ? AND generate = 0',(disttoken,))
z=[]
for a in sql_cursor1:
if arch:
sql_cursor2.execute('SELECT id,name,arch,version FROM package WHERE id = ? AND arch = ?',\
(a[0],arch))
else:
sql_cursor2.execute('SELECT id,name,arch,version FROM package WHERE id = ?',(a[0]))
a=sql_cursor2.fetchall()
z=z+a
return z
def create_deltas(self):
namearchtokens=[]
sql_cursor1 = self.sql_connection.cursor()
sql_cursor2 = self.sql_connection.cursor()
sql_cursor1.execute('SELECT package_id,disttoken FROM dist WHERE generated = 0 ')
for n in sql_cursor1:
#TODO use joins
sql_cursor2.execute('SELECT name,arch FROM package WHERE id = ? ',(n[0],))
for z in sql_cursor2:
a=list(z)+[n[1]] #name,arch,disttoken
if a not in namearchtokens:
namearchtokens.append(a)
for n in namearchtokens:
versions=self.package_versions(n[0],n[2],n[1])
#TODO this is a very good place to delete extra, very old versions
if len(versions) == 1:
print 'Only one version for ',n,versions
else:
print ' Creating deltas for ',n
def _cmp_(a,b):
return VersionCompare(a[3],b[3])
versions.sort(cmp=_cmp_)
new=versions.pop()
for a in versions:
print ' Create delta from ',a[3],' to ',new[3]
#TODO mark all above as 'generated=1' when done, if successful
def create(dbname):
    """Create the sqlite database *dbname* with the debdelta_repo schema.

    Refuses to overwrite an existing database (exits with status 1).
    """
    if os.path.exists(dbname):
        sys.stderr.write(sys.argv[0]+': will not overwrite already existing '+dbname+'\n')
        sys.exit(1)
    if dbapi is not None:
        # preferred path: use the python binding, no external process needed
        conn = dbapi.connect(dbname)
        conn.executescript(sql_scheme)
        conn.commit()
        conn.close()
    else:
        # BUGFIX: the old os.popen("sqlite3 '"+dbname+"'") interpolated the
        # path into a shell command line, so quotes or metacharacters in
        # dbname were interpreted by the shell; pass it as a plain argv
        # element instead and feed the schema on stdin.
        import subprocess
        p = subprocess.Popen(['sqlite3', dbname], stdin=subprocess.PIPE)
        p.communicate(sql_scheme)
        if p.returncode:
            sys.stderr.write(sys.argv[0]+': sqlite3 failed creating '+dbname+'\n')
            sys.exit(1)
def add(dbname, argv, stdin=None):
    """Register packages in the database.

    With stdin true, reads tab-separated lines
    'name version arch filename disttoken' from standard input;
    otherwise argv must be exactly those five values. Commits at the end.
    """
    H = theSQLdb(dbname)
    if stdin:
        for a in sys.stdin:
            # BUGFIX: strip the trailing newline first -- previously the
            # fifth field (disttoken) was stored with a '\n' appended, and
            # the blank-line test below could never be true.
            a = a.rstrip('\n')
            if not a or a[0] == '#':
                continue
            b = string.split(a, '\t')
            if len(b) == 5:
                H.add_one(*b)
            else:
                sys.stderr.write('It is not a tab separated list of 5 elements: %s\n' % repr(a))
    else:
        if len(argv) == 5:
            H.add_one(*argv)
        else:
            sys.stderr.write('It was not given 5 arguments: %s\n' % repr(argv))
    H.commit()
def deltas(dbname):
    """Open the database at *dbname* and generate all pending deltas."""
    theSQLdb(dbname).create_deltas()
def sos(dbname, workspace, argv):
    """Save the single filename in *argv* somewhere inside *workspace* so it
    survives deletion from the archive (old debs are needed to build deltas
    later).  Currently a stub: it only looks the file up and prints what
    would be done."""
    H=theSQLdb(dbname)
    if len(argv) != 1:
        sys.stderr.write('It was not given 1 arguments: %s\n'%repr(argv))
        sys.exit(1)
    # argv is a 1-element list, usable directly as the parameter sequence
    H.sql_cursor.execute('SELECT id,name,version,arch FROM package WHERE filename = ? ',argv)
    a=H.sql_cursor.fetchone()
    if not a:
        sys.stderr.write('Filename not found: %s\n'%repr(argv))
        return
    print 'WILL SAVE',a,'SOMEWHERE INSIDE',workspace,' AND UPDATE SQL ACCORDINGLY'
    #in particular, will mark it as 'owned', so it will be deleted when it will be old
if __name__ == '__main__':
#argv = debugging_argv or sys.argv
if len(sys.argv) <= 1:
help()
raise SystemExit(0)
DEBUG = 0
VERBOSE = 0
JUSTHELP=False
WORKSPACE=None
STDIN=False
cmd=None
try:
( opts, argv ) = getopt.getopt(sys.argv[1:], 'hvdW:' ,
('help','debug','verbose','workspace=','add','stdin','sos','create','deltas') )
except getopt.GetoptError,a:
sys.stderr.write(sys.argv[0] +': '+ str(a)+'\n')
raise SystemExit(2)
for o , v in opts :
if o == '-v' or o == '--verbose' :
VERBOSE += 1
elif o == '-d' or o == '--debug' :
DEBUG += 1
elif o == '--help' or o == '-h':
JUSTHELP = True
elif o == '-W' or o == '--workspace':
WORKSPACE=v
elif o == '--stdin':
STDIN=True
elif o[:2] == '--' and o[2:] in __help__.keys():
if cmd :
sys.stderr.write(' option ',o,'is unacceptable after',cmd)
raise SystemExit(1)
else:
cmd=o[2:]
else:
sys.stderr.write(' option '+o+'is unknown, try --help')
raise SystemExit(1)
if JUSTHELP:
help(cmd)
raise SystemExit(0)
if not WORKSPACE:
sys.stderr.write('Need a workspace. Use -W . Read --help .\n')
raise SystemExit(1)
dbname=os.path.join(WORKSPACE,'theSQLdb')
if cmd == "create":
create(dbname)
elif cmd == 'add':
add(dbname,argv,STDIN)
elif cmd == 'sos':
sos(dbname,WORKSPACE,argv)
elif cmd == 'deltas':
deltas(dbname)
else:
sys.stderr.write("Sorry this command is yet unimplemented: "+cmd+'\n')
sys.exit(1)
debdelta/doc/ 0000755 0000000 0000000 00000000000 12436652150 010261 5 ustar debdelta/doc/debdelta_suite.xml 0000644 0000000 0000000 00000113017 12436652141 013763 0 ustar
The debdelta suite
Andrea C. G.
Mennucci
April 5, 2005
debdelta is an application suite designed to compute
changes between Debian packages. These changes (that we will call
'deltas') are similar to the output of the "diff" program in that
they may be used to store and transmit only the changes between
Debian packages. This suite contains 'debdelta-upgrade', that
downloads deltas and use them to create all Debian packages needed
for an 'apt-get upgrade'.
2006-2011
Overview
The debdelta application suite is really composed of different
applications.
debdelta
debdelta computes the delta, that is, a file
that encodes the difference between two Debian packages.
Example:
$ a=/var/cache/apt/archives
$ debdelta -v $a/emacs-snapshot-common_1%3a20060512-1_all.deb \
$a/emacs-snapshot-common_1%3a20060518-1_all.deb /tmp/emacs.debdelta
the result is:
deb delta is 12.5% of deb ; that is, 15452kB would be saved
debpatch
debpatch
can use the delta file and a copy of the old Debian package
to recreate the new Debian package. (This process is called "applying
the delta file"). If the old Debian package is not available, but is
installed in the host, it can use the installed data; in this case,
'/' is used in lieu of the old .deb.
Example:
$ debpatch -A /tmp/emacs.debdelta / /tmp/emacs.deb
debdeltas
debdeltas
can be used to generate deltas for many debs at once.
It will generate delta files with names such as
package_old-version_new-version_architecture.debdelta.
If the delta exceeds ~70% of the deb, 'debdeltas' will delete it
and leave a stamp of the form
package_old-version_new-version_architecture.debdelta-too-big.
Example usages are in the man page; see also .
debdelta-upgrade
debdelta-upgrade
will download necessary deltas
and apply them to create debs for a
successive apt-get upgrade.
The deltas are available for upgrades in
'stable' , 'stable-security' , 'testing', 'unstable' and 'experimental',
for i386 and amd64.
Example usage:
# apt-get update && debdelta-upgrade && apt-get upgrade
If run by a non-root user, debs are saved in /tmp/archives : do not
forget to move them in /var/cache/apt/archives
debdelta-upgrade will also download .debs for which no delta is
available (this is done in parallel to patching, to maximize
speed). See the explanation of "debdelta-upgrade --deb-policy" in the
man page for more informations and customization on which debs get downloaded.
More informations are in next sections.
debforensic
There is also another bunch of code
(that though was never distributed.... it is available in the GIT repo).
.
debforensics creates
and uses sqlite databases containing information regarding
debian binary packages. debforensics --add
will scan debian packages and add the list of files (and SHA1 hashes
of them) to the database. debforensics --scan
will check a file against multiple databases, to see if that file is part
of any package. debforensics --forensic
will scan a filesystem and list files that are part of a package, and
files that are not (or are misplaced, or have strange permissions....).
If debdelta-upgrade fails to apply a delta, and '-d' is passed,
then a debug file is generated, and then debforensic may be used
to understand what went wrong (theoretically).
Beware: a full database for main/amd64 is
~350MBs, without indexes. So in practice currently I cannot keep
a database in my host.
a delta
The delta is 'ar' archive (see 'man ar').
The delta contains 'info', some data members (named by numbers), a script named 'patch.sh.xxx',
and optional gpg signatures.
The script recreates the new deb. See do_delta_() in the python code for more details.
the info in a delta
a delta first 'ar' member is always named 'info', and is a text file containing some keywords and informations
regarding the delta itself. [TODO add details]
how to apply a delta
TODO WRITEME. You may look into /usr/share/debdelta/debpatch.sh to understand the basics.
debdelta-upgrade service
In June 2006 I set up a delta-upgrading framework, so that people
may upgrade their Debian box using debdelta-upgrade (that downloads
package 'deltas').
This section is an introduction to the framework that is behind
'debdelta-upgrade', and is also used by 'cupt'.
In the following, I will simplify (in places, quite a lot).
The framework
The framework is so organized: I keep up some servers where I use the
program 'debdeltas' to create all the deltas; whereas endusers use the
client 'debdelta-upgrade' to download the deltas and apply them to
produce the debs needed to upgrade their boxes.
In my server, I mirror some repositories, and then I invoke
'debdeltas' to make the deltas between them. I use the
scripts /usr/share/debdelta/debmirror-delta-security
and /usr/share/debdelta/debmirror-marshal-deltas for this.
This generates any delta that may be needed for upgrades
in squeeze,squeeze-security,wheezy,sid,experimental,
for architectures i386 and amd64 (as of Mar 2011); the generated repository of deltas is
more or less 10GB.
The goals
There are two ultimate goals in designing this framework:
SMALL) reduce the size of downloads
(fit for people that pay-by-megabyte);
FAST) speed up the upgrade.
The two goals are unfortunately only marginally compatible. An
example: bsdiff can produce very small deltas, but is quite slow (in
particular with very large files); so currently (2009 on) I use 'xdelta3'
as the backend diffing tool for 'debdeltas' in my server.
Another example is in debs that contain archives ( .gz, , tar.gz
etc etc): I have methods and code to peek inside them, so
the delta become smaller, but the applying gets slower.
The repository structure
The repository of deltas is just a HTTP archive; it is similar to the pool of packages; that is, if
foobar_1_all.deb is stored in
pool/main/f/foobar/ in the repository of debs, then the
delta to upgrade it will be stored in pool/main/f/foobar/foobar_1_2_all.debdelta
in the repository of deltas. Contrary to the repository of debs, a repository of deltas
has no indexes, see . The delta repository is in
http://debdeltas.debian.net/debian-deltas.
The repository creation
Suppose that the unstable archive, on 1st Mar, contains
foobar_1_all.deb (and it is in
pool/main/f/foobar/ ) ; then on 2nd Mar,
foobar_2_all.deb is uploaded; but this
has a flaw (e.g. FTBFS) and so on 3rd Mar
foobar_3_all.deb is uploaded.
On 2nd Mar, the delta server generates
pool/main/f/foobar/foobar_1_2_all.debdelta
On 3rd Mar, the server generates both
pool/main/f/foobar/foobar_1_3_all.debdelta
pool/main/f/foobar/foobar_2_3_all.debdelta.
So, if the end-user Ann upgrades the system on both 2nd and 3rd Mar,
then she uses both foobar_1_2_all.debdelta (on 2nd) and
foobar_2_3_all.debdelta (on 3rd Mar). If the end-user Boe has not
upgraded the system on 2nd Mar, , and he upgrades on 3rd Mar, then on
3rd Mar he uses foobar_1_3_all.debdelta.
size limit
Note that currently the server rejects deltas that exceed 70% of the deb
size: indeed the size gain would be too small, and the time would be
wasted, if you sum the time to download the delta and the time to apply
it (OK, these are run as much as possible in parallel, yet ....).
Also, the server does not generate delta for packages that are smaller than 10KB.
/etc/debdelta/sources.conf
Consider a package that is currently installed. It is characterized by
name installed_version architecture
(unfortunately there is no way to tell from which archive it came
from, but this does not seem to be a problem currently)
Suppose now that a newer version is available somewhere in an archive,
and that the user wishes to upgrade to that version.
The archive Release file contain these info:
Origin , Label , Site, Archive
.
(Note that Archive is called Suite in the Release file).
Example for the security archive:
Origin=Debian
Label=Debian-Security
Archive=stable
Site=security.debian.org
The file /etc/debdelta/sources.conf
, given the above info, determines
the host that should contain the delta for upgrading the package. This
information is called "delta_uri" in that file.
The complete URL for the delta is built adding to the delta_uri a
directory path that mimics the "pool" structure used in Debian
archives, and appending to it a filename of the form
name_oldversion_newversion_architecture.debdelta.
All this is implemented in the example script contrib/findurl.py .
If the delta is not available at that URL, and
name_oldversion_newversion_architecture.debdelta-too-big
is available, then the delta is too big to be useful.
If neither is present, then, either the delta has not yet been
generated, or it will never be generated... but this is difficult to
know.
indexes
indexes of debs in APT
Let's start examining the situation for debs and APT.
Using indexes for debs is a no-brainer decision: indeed, the client
(i.e. the end user) does not know the list of available debs in the
server, and, even knowing the current list, cannot foresee the future
changes.
So indexes provide needed informations: the packages' descriptions,
versions, dependencies, etc etc; these info are used by apt and the
other frontends.
no indexes of deltas in debdelta
If you then think of deltas, you realize that all requirements above
fall. Firstly there is no description and no dependencies for deltas.
deltas have a "info" section, but that is, as to say, standalone
Of course 'debdelta-upgrade' needs some information to determine if a delta
exists, and to download it; but these information are already available:
the name of the package P
the old version O
the new version N
the architecture A
Once these are known, the URL of the file F can be algorithmically
determined as
URI/POOL/P_O_N_A.debdelta
where URI is determined from
/etc/debdelta/sources.conf
and POOL is the directory in the pool of the package P .
This algorithm is also implemented (quite verbosely) in
contrib/findurl.py in the sources of debdelta.
This is the reason why currently there is no "index of deltas", and
nonetheless 'debdelta-upgrade' works fine (and "cupt" as well).
Adding an index of file would only increase downloads (time and size)
and increase disk usage; with negligible benefit, if any.
no incremental deltas
Let me add another point that may be unclear. There are no incremental
deltas (and IMHO never will be).
What "incremental" would be, and why it is not
Please recall .
What does not happen currently is what follows:
on 3rd Mar , Boe decides to upgrade, and invokes 'debdelta-upgrade';
then 'debdelta-upgrade' finds foobar_1_2_all.debdelta and
foobar_2_3_all.debdelta , it uses the foremost to generate
foobar_2_all.deb, and in turn it uses this and the second delta to
generate foobar_3_all.deb .
This is not implemented, and it will not, for the following reasons.
The delta size is, on average, 40% of the size of the deb (and this
is getting worse, for different reasons, see ); so two deltas are 80% of the
target deb, and this too much.
It takes time to apply a delta; applying two deltas to produce one
deb takes too much time.
The server does generate the direct delta
foobar_1_3_all.debdelta
:-) so why making things complex when they are easy? :-)
Note also that incremental deltas would
need some index system to be implemented... indeed, Boe
would have no way to know on 3rd Mar that the intermediate
version of foobar between "1" and "3" is "2"; but since
incremental deltas do not exist, then there is no need to
have indexes).
Repository howto
There are (at least) two ways to manage a repository, and run a server that creates the deltas
debmirror --debmarshal
The first way is what I currently use. It is implemented in the script
/usr/share/debdelta/debmirror-marshal-deltas
(a simpler version, much primitive but more readable , is
/usr/share/debdelta/debmirror-delta-security)
Currently I use the complex script that creates deltas for amd64 and
i386, and for lenny squeeze sid experimental ; and the simpler one for
lenny-security.
Let me start outlining how the simple script generate deltas . It is a 3 steps
process.
Let's say that $secdebmir is the directory containing the mirror of the
repository security.debian.org.
--- 1st step
#make copy of current stable-security lists of packages
olddists=${TMPDIR:-/tmp}/oldsecdists-`date +'%F_%H-%M-%S'`
mkdir $olddists
cp -a $secdebmir/dists $olddists
--- 2nd step
call 'debmirror' to update the mirror ; note that I apply a patch to
debmirror so that old debs are not deleted , but moved to a /old_deb
directory
--- 3rd step
call 'debdeltas' to generate deltas , from the state of packages in
$olddists to the current state in $secdebmir , and also wrt what is in
stable.
Note that, for any package that was deleted from the archive, then
'debdeltas' will go fishing for it inside /old_deb .
The more complex script uses the new debmirror --debmarshal
so it keeps 40 old snapshots of the deb archives, and it generates deltas of the current
package version (the "new" version) to the versions in snapshots -10,-20,-30,-40.
hooks and repository of old_debs
I wrote the skeleton for some commands.
debdelta_repo--add name version arch filename disttoken
This first one is to be called by the archive management tool (e.g. DAK) when a new package enters
in a part of the archive (lets say,
package="foobar" version="2" arch="all" and filename="pool/main/f/foobar/foobar_2_all.deb" just entered
disttoken="testing/main/amd64"). That command will add that to a delta queue, so
appropriate deltas will be generated; this command returns almost immediately.
debdelta_repo--delta
This does create all the deltas.
debdelta_repo--sos filename
This will be called by DAK when (before) it does delete a package from the archive;
this command will save that old deb somewhere (indeed it may be needed to generate deltas sometimes in the future).
(It will be up to some piece of debdelta_repo code to manage the repository of old debs, and
delete excess copies).
TODO this skeleton does not handle 'security', where some old versions of the packages are in
a different DISTTOKEN
Goals, tricks, ideas and issues
exact patching
When debpatch or debdelta-upgrade
recreates a .deb, it will be identical to the desired
one (so it may be possible to check it using the
security features in APT
note though that debdelta-upgrade saves the
recontructed debs in /var/cache/apt/archives, and APT does not check
them there, AFAICT). See though .
exact recompression
Suppose a .deb has inside a huge file
/usr/share/doc/foobar/document.info.gz
and this starts with a RCS tag ... then each time it
is released, the file will be different even though
just few bytes were changed. Another examples are manpages that start with the header
containing the version of the command.
So , to get good compression of the difference, I had
to be able to gunzip those files, diff them,
and gzip back them exactly identical (but possibly for headers
the re-gzipped files are identical but for headers,
(indeed gzip headers contain sometimes a timestamp ); but this is not a problem
since the reconstructed gzipeed file is then piped again into 'xdelta3' or 'bsdiff' to rebuild the 'data.tar',
so the header is fixed at that stage
)
For this reason, I studied gzip formats, and I wrote in debdelta
some python code that does the trick (90% of the times...).
This is implemented in the python routine delta_gzipped_files.
speed
some (old) numbers
Warning: this section is referred to experiments done in 2006, and the backend for
delta encoding was 'xdelta'.
On a desktop with CPU Athlon64 3000 and a average hard disk,
$ debdelta mozilla-browser_1.7.8-1sarge3_i386.deb \
mozilla-browser_1.7.8-1sarge6_i386.deb /tmp/m-b.debdelta
processes the 10Mb of mozilla-browser in ~11sec,
that is a speed of ~900kB per second.
Then debpatch applies the above delta in 16sec,
at a speed of ~600kB per second.
Numbers drop in a old PC, or in a notebook (like mine, that has a
Athlon 1600MHz and slow disks), where data are chewed at ~200kB per
second. Still, since I have a ADSL line that downloads at
max 80kB per second, I have a benefit downloading deltas.
In a theoretical example, indeed, to download a 80MB package, it would
take 1000seconds; whereas to download a delta that is 20% of 80MB it
takes 200seconds, and then 80MB / (200kB/sec) = 400seconds to apply
it, for a total of 600seconds. So I may get a "virtual speed" of 80MB /
600sec = 130kB/sec .
Note that delta downloading and delta patching is done in parallel:
if 4 packages as above have to be downloaded, then the total
time for downloading of full debs would be 4000seconds, while the time
for parallel-download-patch-apply-patch may be as low as 1400seconds.
This is a real example of running 'debdelta-upgrade' :
Looking for a delta for libc6 from 2.3.6-9 to 2.3.6-11
Looking for a delta for udev from 0.092-2 to 0.093-1
Patching done, time: 22sec, speed: 204kB/sec, result: libc6_2.3.6-11_i386.deb
Patching done, time: 4sec, speed: 57kB/sec, result: udev_0.093-1_i386.deb
Delta-upgrade download time 28sec speed 21.6k/sec
total time: 53sec; virtual speed: 93.9k/sec.
(Note that the "virtual speed" of 93.9k/sec , while less than the
130kB/sec of the theoretical example above, is still more than the
80kB that my ADSL line would allow).
Of course the above is even better for people with fast disks and/or
slow modems.
Actually, an apt delta method may do a smart decision of how many
deltas to download, and in which order, to optimize the result, (given
the deltas size, the packages size, the downloading speed and the
patching speed).
speeding up
The problem is that the process of applying a delta to create a new
deb is currently slow, even on very fast machines.
One way to overcome is to "parallelize as much as possible".
The best strategy that I can imagine is to keep both the CPU,
the hard disk, and the Internet connection, always maxed up.
This is why 'debdelta-upgrade' has two threads, the "downloading
thread" and the "patching thread". The downloading thread downloads
deltas (ordered by increasing size), and as soon as they are
downloaded, it queues them to be applied in the "patching thread";
whereas as soon as all available deltas are downloaded it starts
downloading some debs, and goes on for as long as the deltas are being
applied in the "patching thread".
Summarizing, the downloading thread keeps Internet busy while the
patching thread keeps the CPU and HDD busy.
Another speedup strategy is embedded inside the deltas
themselves: since bsdiff is a memory hog, when the backend is
bsdiff, I have to divide the data in chunks; this may lower the
compression ratio, but the good point is that the HDD accesses
and the calls to bsdiff can run "in parallel". With newer
xdelta3, xdelta3 can read the original data from a pipe, so the
data are not divided in chunks, but rather continously piped
into xdelta3; so xdelta3 runs at the same time as when the data
are read from HDD.
the 10kb trick
currently, roughly half of the generated deltasthat is, discarding those that
are more than 70% of the corresponding deb are less than 10KB.
debdelta-upgrade downloads deltas in two passes,
in the first pass it tries to download the first 10KB of a delta;
if it gets a complete delta, it immediately pipes it in the "patching thread queue", otherwise if it gets
only a partial download, it adds it to the download queue; if it gets HTTP404, it
possibly checks for the "toobig" timestamp, and it possibly warns the user.
in the second pass, it downloads the rest of the deltas, and queues them for patching
Why this complex method? because the first 10KBs of a delta contain the info, and those may be used
to actually decide not to download the rest of the delta (if a TODO predictor
decides that it is not worthwhile...).
the choice, the predictor
Which deltas should be downloaded, VS which debs?
Currently there is a rule-of-thumb: the server immediately deletes any
delta that exceeds 70% of the original deb , and it replaces it with
an empty file ending in ".debdelta-too-big". In such cases,
"debdelta-upgrade" will download the deb instead.
See the explanation of "debdelta-upgrade --deb-policy" in the man page
for more info and customization on which debs get downloaded.
Some time ago I tried to do devise a better way to understand when to
download a delta w.r.t. a deb. The code is in the "Predictor" class
.... but I could not reliably predict the final speed of patching, so
currently it is not used.
State of the art
All in all, I still cannot obtain high speeds: so people that have a fast
ADSL Internet connection usually are better
downloading all the debs, and ignoring "debdelta-upgrade" alltogether.
Anyway, the best way to know is to try "debdelta-upgrade -v" and
read the final statistics. See
and for recent developments.
better deb compression is a worse delta
'xdelta3' can reconstruct data at high speed: on nowadays processors, it can process up to 2MB per second;
but, when applying a delta, 'xdelta3' works on uncompressed data.
So if the data is then compressed at a ratio 1/3, then the resulting speed on compressed data
is 700KB/sec. Moreover, time is needed to actually compress the data.
In recent years, 'dpkg' has transitioned from 'data.tar.gz' to 'data.tar.bz2' to 'data.tar.lzma';
each method is better at compressing, but is also slower than the previous one; since it is better at
compressing, it also defeats the ability of 'debdelta' to produce small deltas (wrt the original deb, of course),
and indeed statistics show that deltas are getting larger; since it is slower, it slows down the applying of
deltas as well.
long time recovery
As aforementioned, deltas can rebuild the deb identically to the byte. But the patch.sh script
calls the standard tools 'tail','head','gzip','bzip2','lzma', etc etc to rebuild a delta; so
if the argument calling or output of any of those tools changes, than a delta may become unusable.
As long as deltas are used for the debdelta-upgrade service, this is no big deal: if such a tool changes,
then we can adjust the deltas to it, and there is just some days disruption of the service
this actually already happened some years ago, with libzip
(and people will download debs instead of deltas .... as we used to).
If anybody wants instead to use debdelta to archive debs for long time, (as the archive.debian.org service
was doing), then we should make sure that , at any moment in future, deltas can be applied.
A possible solution would be that deltas should contain, in the info files, the versions of all tools that
are needed for applying. A second solution is that debdelta should keep a standard set of those tools inside the package.
streaming
Let me summarize. When 'debdelta-upgrade' (or 'debpatch') recreates a
deb, one step is reassembling the data.tar part inside it; this part
moreover is compressed (gzip, bzip2 or lately lzma). This
'reassembling and compressing' takes time (both for CPU and for HD),
and is moreover quite useless, since, in short time, 'apt' will call
'dpkg -i' that decompresses and reopens the data.tar in the deb.
It is then reasonable to collapse this two parts, and this would
possibly speed up the upgrade a bit. A first step is
'--format=unzipped'
, a next step may be '--format=preunpacked'
.
Todo
todo list
Prepare an APT method so that
'apt-get upgrade' would actually use deltas.
Some code is already written. See also 2011 Google Summer of Code.
As in . It would be nice if debdelta-upgrade would actually choose if
download a delta and use it to create the .deb
download the deb
depending on which one would be faster.
Unfortunately, this decision must depend on a good model
to predict the speed of patching... and this I still cannot
achieve.
in debdelta-upgrade, have as many "patching thread" as there are cores
upgrade debdelta-upgrade to newer libapt
support multiarch
collect data, benchmark! (some debdelta behaviours are coded in magic numbers that I got
from thumb reasoning on small datasets)
support long time exact recovery : embed a copy of gzip, libzip, bzip2 and lzma in debdelta??
things are getting worse
Compared to when I started deploying debdelta, things have got worse, for two reasons:
one problem is that delta backends are bad at compressing a binary that
was compiled from the same source but with two different compilers; see in particular
the Google Courgette project, and compare it with
the Google Courgette project, and compare it with
the problems I encountered lately when Debian switched from GCC 4.4 to 4.5,
when it happened that the binaries were so different that
the compression of the new binary with LZMA would be smaller than the BSDIFF of
the old and the new binary (!!).
Unfortunately it seems that Google Courgette was hit with
a patent infringement claim,
so we should study how to reduce the size of deltas, and/or making them faster (possibly implementing lzma in xdelta3;
or automatically choosing 'bsdiff' vs 'xdelta3' depending on the situation).
debdelta/doc/Makefile 0000644 0000000 0000000 00000000347 12436652141 011725 0 ustar all: html/index.html debdelta_suite.pdf
# Create the output directory for the chunked HTML documentation.
html:
	mkdir html

# Regenerate the HTML docs from the DocBook source.
# 'rm -f' so that a freshly created (empty) html/ directory — where the
# glob matches nothing — does not abort the build as plain 'rm' would.
html/index.html: html debdelta_suite.xml
	rm -f html/*.html
	docbook2html -o html debdelta_suite.xml

# Build the PDF from the same DocBook source.
debdelta_suite.pdf: debdelta_suite.xml
	docbook2pdf debdelta_suite.xml
debdelta/doc/debdelta_suite.txt 0000644 0000000 0000000 00000011425 12436652150 014002 0 ustar
The debdelta suiteCopyright © 2006-2011
debdelta is an application suite designed to compute
changes between Debian packages. These changes (that we will call
'deltas') are similar to the output of the "diff" program in that
they may be used to store and transmit only the changes between
Debian packages. This suite contains 'debdelta-upgrade', that
downloads deltas and uses them to create all Debian packages needed
for an 'apt-get upgrade'.
The debdelta application suite is really composed of different applications.
'debdelta' computes the delta, that is, a file that encodes the difference
between two Debian packages.
Example:
$ a=/var/cache/apt/archives
$ debdelta -v $a/emacs-snapshot-common_1%3a20060512-1_all.deb \
$a/emacs-snapshot-common_1%3a20060518-1_all.deb /tmp/emacs.debdelta
the result is:
deb delta is 12.5% of deb ; that is, 15452kB would be saved
'debpatch' can use the delta file and a copy of the old Debian package
to recreate the new Debian package. (This process is called "applying
the delta file"). If the old Debian package is not available, but is
installed in the host, it can use the installed data; in this case,
'/' is used in lieu of the old .deb.
Example:
$ debpatch -A /tmp/emacs.debdelta / /tmp/emacs.deb
'debdeltas' can be used to generate deltas for many debs at once.
It will generate delta files with names such as
package_old-version_new-version_architecture.debdelta
and put them in the directory where the new .deb is.
If the delta exceeds ~70% of the deb, 'debdeltas' will delete it
and leave a stamp of the form
package_old-version_new-version_architecture.debdelta-too-big
Example usages are in the man page; see also
the scripts /usr/share/debdelta/debmirror-delta-security
and /usr/share/debdelta/debmirror-deltas
This command will download necessary deltas
and apply them to create debs for an 'apt-get upgrade' .
The deltas are available for upgrades in
'stable' , 'stable-security' , 'testing', 'unstable' and 'experimental',
for i386 and amd64.
Example usage:
# apt-get update && debdelta-upgrade && apt-get upgrade
If run by a non-root user, debs are saved in /tmp/archives : do not
forget to move them in /var/cache/apt/archives
debdelta-upgrade will also download .debs for which no delta is
available (this is done in parallel to patching, to maximize
speed). See the explanation of "debdelta-upgrade --deb-policy" in the
man page for more info and customization on which debs get downloaded.
More info is in README.upgrade
Prepare an APT method so that
'apt-get upgrade' would actually use deltas.
Some code is already written.
It would be nice if debdelta-upgrade would actually choose whether to
download a delta or the full deb, depending on which one would be faster.
Unfortunately, this decision must depend on a good model
to predict the speed of patching... and this I still cannot
achieve.
debdelta/doc/debdelta_suite.pdf 0000644 0000000 0000000 00000474644 12436652150 013754 0 ustar %PDF-1.4
%
1 0 obj
<< /S /GoTo /D (0.1.1) >>
endobj
4 0 obj
(1. Overview)
endobj
5 0 obj
<< /S /GoTo /D (0.1.1.2) >>
endobj
8 0 obj
(1.1. debdelta)
endobj
9 0 obj
<< /S /GoTo /D (0.1.2.2) >>
endobj
12 0 obj
(1.2. debpatch)
endobj
13 0 obj
<< /S /GoTo /D (0.1.3.2) >>
endobj
16 0 obj
(1.3. debdeltas)
endobj
17 0 obj
<< /S /GoTo /D (0.1.4.2) >>
endobj
20 0 obj
(1.4. debdeltaupgrade)
endobj
21 0 obj
<< /S /GoTo /D (0.1.5.2) >>
endobj
24 0 obj
(1.5. debforensic)
endobj
25 0 obj
<< /S /GoTo /D (0.2.1) >>
endobj
28 0 obj
(2. a delta)
endobj
29 0 obj
<< /S /GoTo /D (0.2.6.2) >>
endobj
32 0 obj
(2.1. the info in a delta)
endobj
33 0 obj
<< /S /GoTo /D (0.2.7.2) >>
endobj
36 0 obj
(2.2. how to apply a delta)
endobj
37 0 obj
<< /S /GoTo /D (0.3.1) >>
endobj
40 0 obj
(3. debdeltaupgrade service)
endobj
41 0 obj
<< /S /GoTo /D (0.3.8.2) >>
endobj
44 0 obj
(3.1. The framework)
endobj
45 0 obj
<< /S /GoTo /D (0.3.9.2) >>
endobj
48 0 obj
(3.2. The goals)
endobj
49 0 obj
<< /S /GoTo /D (0.3.10.2) >>
endobj
52 0 obj
(3.3. The repository structure)
endobj
53 0 obj
<< /S /GoTo /D (0.3.11.2) >>
endobj
56 0 obj
(3.4. The repository creation)
endobj
57 0 obj
<< /S /GoTo /D (0.3.12.2) >>
endobj
60 0 obj
(3.5. size limit)
endobj
61 0 obj
<< /S /GoTo /D (0.3.13.2) >>
endobj
64 0 obj
(3.6. /etc/debdelta/sources.conf)
endobj
65 0 obj
<< /S /GoTo /D (0.3.14.2) >>
endobj
68 0 obj
(3.7. indexes)
endobj
69 0 obj
<< /S /GoTo /D (0.3.14.1.3) >>
endobj
72 0 obj
(3.7.1. indexes of debs in APT)
endobj
73 0 obj
<< /S /GoTo /D (0.3.14.2.3) >>
endobj
76 0 obj
(3.7.2. no indexes of deltas in debdelta)
endobj
77 0 obj
<< /S /GoTo /D (0.3.15.2) >>
endobj
80 0 obj
(3.8. no incremental deltas)
endobj
81 0 obj
<< /S /GoTo /D (0.3.15.3.3) >>
endobj
84 0 obj
(3.8.1. What "incremental" would be, and why it is not)
endobj
85 0 obj
<< /S /GoTo /D (0.3.16.2) >>
endobj
88 0 obj
(3.9. Repository howto)
endobj
89 0 obj
<< /S /GoTo /D (0.3.16.4.3) >>
endobj
92 0 obj
(3.9.1. debmirror debmarshal)
endobj
93 0 obj
<< /S /GoTo /D (0.3.16.5.3) >>
endobj
96 0 obj
(3.9.2. hooks and repository of olddebs)
endobj
97 0 obj
<< /S /GoTo /D (0.4.1) >>
endobj
100 0 obj
(4. Goals, tricks, ideas and issues)
endobj
101 0 obj
<< /S /GoTo /D (0.4.17.2) >>
endobj
104 0 obj
(4.1. exact patching)
endobj
105 0 obj
<< /S /GoTo /D (0.4.18.2) >>
endobj
108 0 obj
(4.2. exact recompression)
endobj
109 0 obj
<< /S /GoTo /D (0.4.19.2) >>
endobj
112 0 obj
(4.3. speed)
endobj
113 0 obj
<< /S /GoTo /D (0.4.19.6.3) >>
endobj
116 0 obj
(4.3.1. some \(old\) numbers)
endobj
117 0 obj
<< /S /GoTo /D (0.4.19.7.3) >>
endobj
120 0 obj
(4.3.2. speeding up)
endobj
121 0 obj
<< /S /GoTo /D (0.4.19.8.3) >>
endobj
124 0 obj
(4.3.3. the 10kb trick)
endobj
125 0 obj
<< /S /GoTo /D (0.4.19.9.3) >>
endobj
128 0 obj
(4.3.4. the choice, the predictor)
endobj
129 0 obj
<< /S /GoTo /D (0.4.19.10.3) >>
endobj
132 0 obj
(4.3.5. State of the art)
endobj
133 0 obj
<< /S /GoTo /D (0.4.20.2) >>
endobj
136 0 obj
(4.4. better deb compression is a worse delta)
endobj
137 0 obj
<< /S /GoTo /D (0.4.21.2) >>
endobj
140 0 obj
(4.5. long time recovery)
endobj
141 0 obj
<< /S /GoTo /D (0.4.22.2) >>
endobj
144 0 obj
(4.6. streaming)
endobj
145 0 obj
<< /S /GoTo /D (0.4.23.2) >>
endobj
148 0 obj
(4.7. format=unzipped)
endobj
149 0 obj
<< /S /GoTo /D (0.4.24.2) >>
endobj
152 0 obj
(4.8. format=preunpacked)
endobj
153 0 obj
<< /S /GoTo /D (0.5.1) >>
endobj
156 0 obj
(5. Todo)
endobj
157 0 obj
<< /S /GoTo /D (0.5.25.2) >>
endobj
160 0 obj
(5.1. todo list)
endobj
161 0 obj
<< /S /GoTo /D (0.5.26.2) >>
endobj
164 0 obj
(5.2. things are getting worse)
endobj
165 0 obj
<< /S /GoTo /D [166 0 R /Fit ] >>
endobj
168 0 obj <<
/Length 1056
/Filter /FlateDecode
>>
stream
xڝV]8}_FHw>kT%oUP $&03}2${5st<˝g>qHSNyrzWKd}*I8m}p%>xH_qe
4IR4v\ci78\_XH
@BE.06h")0>%~FBo]
Rnֶʌʲ1?hcDy,okn
ʏ`0fp>}iii6_/. rbfLГbUeibQ0L(벢ݔ,̿I|y
ׇ{CkM)mTqsn5*$9Sn(uiuov`<߬ᒖ\LJZBV7Z^54VD/3V}Q^:DeI@VtSh+#3Zhn"GoeAaLx