debian/0000755000000000000000000000000012243356617007177 5ustar debian/watch0000644000000000000000000000010412243274472010221 0ustar version=3 https://github.com/sheepdog/sheepdog/tags .*/v(.*).tar.gz debian/sheepdog.templates0000644000000000000000000000236512243274472012721 0ustar # These templates have been reviewed by the debian-l10n-english # team # # If modifications/additions/rewording are needed, please ask # debian-l10n-english@lists.debian.org for advice. # # Even minor modifications require translation updates and such # changes should be coordinated with translators and reviewers. Template: sheepdog/start Type: boolean Default: false _Description: Automatically start the sheepdog service? Please choose whether the sheepdog service should start automatically when the system is booted. Template: sheepdog/daemon_args Type: string Default: _Description: Arguments for the sheepdog daemon: Please choose the command line arguments that should be passed to the sheepdog daemon. If no argument is given, the default behavior is to start on port 7000, using the corosync driver. . Available options include: -p, --port specify the TCP port to listen to -l, --loglevel specify the level of logging detail -d, --debug include debug messages in the log -D, --directio use direct I/O when accessing the object store -z, --zone specify the zone ID -c, --cluster specify the cluster driver More information can be found in the sheep(8) manual page. debian/sheepdog.postrm0000644000000000000000000000020112243274472012232 0ustar #!/bin/sh set -e if [ "${1}" = "purge" ] ; then rm -f /etc/default/sheepdog rm -rf /var/lib/sheepdog fi #DEBHELPER# exit 0 debian/sheepdog.postinst0000644000000000000000000000121112243274472012573 0ustar #!/bin/sh set -e if [ "$1" = "configure" ] ; then . /usr/share/debconf/confmodule mkdir -p /var/lib/sheepdog/ mkdir -p /etc/default if [ ! 
-e /etc/default/sheepdog ] ; then cp /usr/share/sheepdog/debian-sheepdog-default /etc/default/sheepdog fi if [ -r /etc/default/sheepdog ] ; then db_get sheepdog/start if [ "${RET}" = "true" ] ; then SERVICE_START="yes" else SERVICE_START="no" fi sed -i -e "s/^[ \t]*START=.*/START=\"$SERVICE_START\"/g" /etc/default/sheepdog db_get sheepdog/daemon_args sed -i -e "s/^[ \t]*DAEMON_ARGS=.*/DAEMON_ARGS=\"$RET\"/g" /etc/default/sheepdog fi db_stop || true fi #DEBHELPER# exit 0 debian/sheepdog.links0000644000000000000000000000003712243274472012035 0ustar /usr/sbin/dog /usr/sbin/collie debian/sheepdog.install0000644000000000000000000000006312243274472012362 0ustar debian/debian-sheepdog-default /usr/share/sheepdog debian/sheepdog.init0000644000000000000000000001010412243274472011654 0ustar #!/bin/sh ### BEGIN INIT INFO # Provides: sheepdog # Required-Start: hostname $network $remote_fs $syslog # Required-Stop: $remote_fs # Default-Start: 2 3 4 5 # Default-Stop: 0 1 6 # Short-Description: Sheepdog is a distributed storage system for KVM/QEMU. # Description: Sheepdog is a distributed storage system for KVM/QEMU. It provides # highly available block level storage volumes to virtual machines. # Sheepdog supports advanced volume management features such as snapshot, # cloning, and thin provisioning. The architecture of Sheepdog is fully # symmetric; there is no central node such as a meta-data server. ### END INIT INFO # Author: YunQiang Su # PATH should only include /usr/* if it runs after the mountnfs.sh script PATH=/sbin:/usr/sbin:/bin:/usr/bin DESC=sheepdog # Introduce a short description here NAME=sheepdog # Introduce the short server's name here DAEMON=/usr/sbin/sheep # Introduce the server's location here DAEMON_ARGS="" # Arguments to run the daemon with PIDFILE=/var/run/$NAME.pid SCRIPTNAME=/etc/init.d/$NAME # Exit if the package is not installed [ -x $DAEMON ] || exit 0 # Read configuration variable file if it is present [ -r /etc/default/$NAME ] && . 
/etc/default/$NAME #FIXME: user cannot give pidfile in /etc/default/sheepdog DAEMON_ARGS="$DAEMON_ARGS --pidfile $PIDFILE" if [ "$START" != "yes" ]; then exit 0 fi # Define LSB log_* functions. # Depend on lsb-base (>= 3.0-6) to ensure that this file is present. . /lib/lsb/init-functions # # Function that starts the daemon/service # do_start() { # Return # 0 if daemon has been started # 1 if daemon was already running # 2 if daemon could not be started start-stop-daemon --start --quiet --pidfile $PIDFILE --exec $DAEMON --test > /dev/null \ || return 1 start-stop-daemon --start --quiet --pidfile $PIDFILE --exec $DAEMON -- \ $DAEMON_ARGS $SHEEPDOG_PATH \ || return 2 # Add code here, if necessary, that waits for the process to be ready # to handle requests from services started subsequently which depend # on this one. As a last resort, sleep for some time. } # # Function that stops the daemon/service # do_stop() { # Return # 0 if daemon has been stopped # 1 if daemon was already stopped # 2 if daemon could not be stopped # other if a failure occurred start-stop-daemon --stop --quiet --pidfile $PIDFILE RETVAL="$?" return "$RETVAL" } # # Function that sends a SIGHUP to the daemon/service # do_reload() { # # If the daemon can reload its configuration without # restarting (for example, when it is sent a SIGHUP), # then implement that here. # start-stop-daemon --stop --signal 1 --quiet --pidfile $PIDFILE --name $NAME return 0 } case "$1" in start) [ "$VERBOSE" != no ] && log_daemon_msg "Starting $DESC " "$NAME" do_start case "$?" in 0|1) [ "$VERBOSE" != no ] && log_end_msg 0 ;; 2) [ "$VERBOSE" != no ] && log_end_msg 1 ;; esac ;; stop) [ "$VERBOSE" != no ] && log_daemon_msg "Stopping $DESC" "$NAME" do_stop case "$?" in 0|1) [ "$VERBOSE" != no ] && log_end_msg 0 ;; 2) [ "$VERBOSE" != no ] && log_end_msg 1 ;; esac ;; status) status_of_proc "$DAEMON" "$NAME" && exit 0 || exit $? 
;; #reload|force-reload) # # If do_reload() is not implemented then leave this commented out # and leave 'force-reload' as an alias for 'restart'. # #log_daemon_msg "Reloading $DESC" "$NAME" #do_reload #log_end_msg $? #;; restart|force-reload) # # If the "reload" option is implemented then remove the # 'force-reload' alias # log_daemon_msg "Restarting $DESC" "$NAME" do_stop case "$?" in 0|1) do_start case "$?" in 0) log_end_msg 0 ;; 1) log_end_msg 1 ;; # Old process is still running *) log_end_msg 1 ;; # Failed to start esac ;; *) # Failed to stop log_end_msg 1 ;; esac ;; *) #echo "Usage: $SCRIPTNAME {start|stop|restart|reload|force-reload}" >&2 echo "Usage: $SCRIPTNAME {start|stop|status|restart|force-reload}" >&2 exit 3 ;; esac exit 0 debian/sheepdog.config0000644000000000000000000000056312243274472012166 0ustar #!/bin/sh set -e . /usr/share/debconf/confmodule if [ -r /etc/default/sheepdog ] ; then . /etc/default/sheepdog if [ x"yes" = x"$START" ] ; then db_set sheepdog/start true else db_set sheepdog/start false fi db_set sheepdog/daemon_args "$DAEMON_ARGS" fi db_input medium sheepdog/start || true db_input medium sheepdog/daemon_args || true db_go || true exit 0 debian/sheepdog.bash-completion0000644000000000000000000000010112243274472013771 0ustar script/bash_completion_dog dog script/bash_completion_dog collie debian/rules0000755000000000000000000000403612243274472010260 0ustar #!/usr/bin/make -f # -*- makefile -*- DEBVERS ?= $(shell dpkg-parsechangelog | sed -n -e 's/^Version: //p') VERSION ?= $(shell echo '$(DEBVERS)' | sed -e 's/^[[:digit:]]*://' -e 's/[-].*//') DEBFLAVOR ?= $(shell dpkg-parsechangelog | grep -E ^Distribution: | cut -d" " -f2) DEBPKGNAME ?= $(shell dpkg-parsechangelog | grep -E ^Source: | cut -d" " -f2) UPSTREAM_GIT ?= git://github.com/sheepdog/sheepdog.git GIT_TAG ?= $(shell echo v'$(VERSION)' | sed -e 's/~/_/') %: dh $@ --with autoreconf override_dh_builddeb: dh_builddeb -- -Zxz -z9 override_dh_autoreconf: dh_autoreconf --mode=timesize 
override_dh_auto_build: dh_auto_build --parallel UNAME := $(shell uname) ifeq ($(UNAME),Linux) ZOOKEEPER=--enable-zookeeper endif override_dh_auto_configure: dh_auto_configure -- ${ZOOKEEPER} override_dh_install: dh_install rm -rf debian/sheepdog/etc/init.d/ dh_bash-completion get-vcs-source: git remote add upstream $(UPSTREAM_GIT) || true git fetch upstream if [ ! -f ../$(DEBPKGNAME)_$(VERSION).orig.tar.xz ] ; then \ git archive --prefix=$(DEBPKGNAME)-$(GIT_TAG)/ $(GIT_TAG) | xz >../$(DEBPKGNAME)_$(VERSION).orig.tar.xz ; \ fi if [ ! -e ../build-area ] ; then mkdir ../build-area ; fi if [ ! -e ../build-area ] ; then cp ../$(DEBPKGNAME)_$(VERSION).orig.tar.xz ../build-area ; fi if ! git checkout master ; then \ echo "No upstream branch: checking out" ; \ git checkout -b master upstream/master ; \ fi git checkout debian/$(DEBFLAVOR) display-po-stats: cd $(CURDIR)/debian/po; for i in *.po ;do \ echo -n $$i": ";\ msgfmt -o /dev/null --statistic $$i ; \ done call-for-po-trans: podebconf-report-po --call --withtranslators --languageteam gen-upstream-changelog: git checkout master git reset --hard $(GIT_TAG) git log >$(CURDIR)/../CHANGELOG git checkout debian/$(DEBFLAVOR) mv $(CURDIR)/../CHANGELOG $(CURDIR)/debian/CHANGELOG git add $(CURDIR)/debian/CHANGELOG git commit -a -m "Updated upstream changelog" override_dh_installchangelogs: dh_installchangelogs $(CURDIR)/debian/CHANGELOG debian/gbp.conf0000644000000000000000000000021712243355205010606 0ustar [DEFAULT] upstream-branch = master debian-branch = debian/unstable upstream-tag = v%(version)s [git-buildpackage] export-dir = ../build-area/ debian/docs0000644000000000000000000000000712243274472010045 0ustar README debian/debian-sheepdog-default0000644000000000000000000000152212243274472013560 0ustar # start sheepdog at boot [yes|no] START="yes" # Arguments to run the daemon with # Options: # -p, --port specify the TCP port on which to listen # -l, --loglevel specify the level of logging detail # -d, --debug include debug 
messages in the log # -D, --directio use direct IO when accessing the object store # -z, --zone specify the zone id # -c, --cluster specify the cluster driver DAEMON_ARGS="" # SHEEPDOG_PATH # Proper LSB systems will store sheepdog files in /var/lib/sheepdog. The init script uses this directory by default. # The directory must be on a filesystem with xattr support. In the case of ext3, user_xattr should be added to the # mount options. # # mount -o remount,user_xattr /var/lib/shepdog SHEEPDOG_PATH="/var/lib/sheepdog" debian/copyright0000644000000000000000000000301012243274472011122 0ustar Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ Upstream-Contact: MORITA Kazutaka , Liu Yuan Upstream-Name: sheepdog Source: git://github.com/sheepdog/sheepdog.git Files: debian/* Copyright: 2010, Guido Günther 2012, YunQiang Su 2012, Thomas Goirand License: GPL-2 Files: * Copyright: 2009-2011 Nippon Telegraph and Telephone Corporation With upstream authors as folow: 2009-2011, MORITA Kazutaka 2009-2011, FUJITA Tomonori 2009-2011, MORIAI Satoshi License: GPL-2 License: GPL-2 This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. . This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. . You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA . On Debian systems, the complete text of the GNU General Public License v2 (GPL) can be found in /usr/share/common-licenses/GPL-2. 
debian/control0000644000000000000000000000170612243355002010571 0ustar Source: sheepdog Section: admin Priority: optional Maintainer: PKG OpenStack Uploaders: YunQiang Su Build-Depends: debhelper (>= 9), dh-autoreconf, bash-completion, pkg-config, libcorosync-dev, liburcu-dev, libzookeeper-mt-dev [!hppa], libfuse-dev, po-debconf Standards-Version: 3.9.5 Homepage: http://sheepdog.github.io/sheepdog Vcs-Browser: http://anonscm.debian.org/?p=openstack/sheepdog.git Vcs-Git: git://anonscm.debian.org/openstack/sheepdog.git Package: sheepdog Architecture: linux-any Pre-Depends: dpkg (>= 1.15.6~) Depends: ${shlibs:Depends}, ${misc:Depends} Recommends: corosync Description: distributed storage system for QEMU Sheepdog provides highly available block level storage volumes that can be attached to QEMU virtual machines. Sheepdog scales to several hundred nodes, and supports advanced volume management features such as snapshots, cloning, and thin provisioning. debian/compat0000644000000000000000000000000212243274472010373 0ustar 9 debian/changelog0000644000000000000000000001225512243355343011051 0ustar sheepdog (0.7.5-1) unstable; urgency=low * New upstream release 0.7.5: Remove patch Makefile-am-CPPFLAGS-instead-INCLUDES, merged to upstream * Use subdir-objects AC option to remove automake warning. * Define EFD_SEMAPHORE if not exists. * Build only for linux-any: eventfd is used. * New policy version 3.9.5. -- YunQiang Su Thu, 21 Nov 2013 11:11:56 +0800 sheepdog (0.7.3-1) unstable; urgency=low * Imported Upstream version 0.7.3 The upstream has also the debian/* so merged with them. * Remove patches merged upstream * Don't source /lib/init/vars.sh any more * Use canonical vcs url and update homepage -- YunQiang Su Sun, 22 Sep 2013 15:32:01 +0800 sheepdog (0.6.0-2~exp1) experimental; urgency=low * Fix another amd64 only assembly in include/logger.h. * Fix some gcc warning about alignment. 
-- YunQiang Su Thu, 25 Jul 2013 22:56:30 +0800 sheepdog (0.6.0-2~exp) experimental; urgency=low * In lib/logger.c it use assembly only for amd64. -- YunQiang Su Tue, 23 Jul 2013 16:31:16 +0800 sheepdog (0.6.0-1) unstable; urgency=low * Imported Upstream version 0.6.0 * Don't source /lib/init/vars.sh in init -- YunQiang Su Fri, 19 Jul 2013 23:13:49 +0800 sheepdog (0.5.6-2) unstable; urgency=low * Enable sheepfs support: build-dep on libfuse-dev. -- YunQiang Su Fri, 19 Apr 2013 13:12:43 +0800 sheepdog (0.5.6-1) unstable; urgency=low * Debconf templates and debian/control reviewed by the debian-l10n- english team as part of the Smith review project. Closes: #694518 * [Debconf translation updates] * Czech (Michal Simunek). Closes: #696391 * French (Julien Patriarca). Closes: #696801 * Portuguese (Pedro Ribeiro). Closes: #696982 * German (Chris Leick). Closes: #697091 * Spanish; (Camaleón). Closes: #697106 * Japanese (victory). Closes: #697114 * Russian (Yuri Kozlov). Closes: #697179 * Italian (Beatrice Torracca). Closes: #697214 * Galician (Jorge Barreiro). Closes: #697228 * Polish (Michał Kułach). Closes: #697238 * Swedish (Martin Bagge / brother). Closes: #697293 * Danish (Joe Hansen). Closes: #697295 [ Thomas Goirand ] * debian/copyright is now in machine readable format 1.0. * Manages /etc/default/sheepdog in a policy compliant way (eg: the file is copied from /usr/share/sheepdog in postinst, and isn't a conffiles). * Removes Guido Guenther from uploaders as requested. * Uses debhelper 9 and compat 9, so there's hardening build flags. * More permisive sed call in postinst (eg: allows spaces and tabs before the variable). * Corrects debian/rules get-vcs-source to work like with the rest of the openstack packages. * Using xz as compression. * Added upstream changelog (out of a "git log" output). [ YunQiang Su ] * Import the new upstream tag 0.5.6 (Closes: #671466). * Create master branch act as upstream branch. 
* Add Simplified Chinese (zh_CN) translation for debconf. * Use pidfile to ensure service stop, remove the former hack. -- YunQiang Su Fri, 08 Feb 2013 11:04:58 +0800 sheepdog (0.5.4-1) unstable; urgency=low * Add debian/watch; remove debian/gbp.conf * Imported Upstream version 0.5.4 * Recommends corosync but not dependends now * Remove 0001-Skip-the-tests.patch and 0002-Add-collie-manpage.patch: in upstream tarball now * Use dh_autoreconf instead of running autogen.sh in rules * Add liburcu-dev to build depends * Add debconf and zookeeper support * Start service default * Bump to stand version 3.9.4; Priority to optional * Set PKG OpenStack as maintainer Set YunQiang Su and Guido Guenther as Uploaders. -- YunQiang Su Wed, 24 Oct 2012 11:01:24 +0800 sheepdog (0.3.0-3) unstable; urgency=low * [907284e] Make sheepdog linux-any since configure bails out on non linux architectures. * [c87c560] Add collie manpage. Thanks to Jens WEBER -- Guido Günther Wed, 04 Jul 2012 22:47:41 +0200 sheepdog (0.3.0-2) unstable; urgency=low * Upload to unstable * [eb4b22d] Use override instead of --before and --after to simplify the build logic and avoid the deprecation warning -- Guido Günther Wed, 04 Jul 2012 14:54:21 +0200 sheepdog (0.3.0-1) experimental; urgency=low [ Guido Günther ] * [9d6c032] Add gbp.conf * [3cf6e61] Bump Standards version * [1a4ec06] Add missing $remote_fs dependency * [450ea59] Install collie bash completion [ Jens Weber ] * [7d300a2] Add init script * [7ac0976] Skip the tests for now since they don't run from within pbuilder -- Guido Günther Sun, 24 Jun 2012 21:41:01 +0200 sheepdog (0.2.4-1) experimental; urgency=low * New upstream version * [e0f4d8a] Don't install init.d script -- Guido Günther Mon, 14 Nov 2011 15:35:17 +0100 sheepdog (0.2.0~0.git452bc6-1) unstable; urgency=low * Initial release (Closes: #606134) -- Guido Günther Tue, 07 Dec 2010 11:06:30 +0100 debian/CHANGELOG0000644000000000000000000257776712243274447010450 0ustar commit 
beea6156c9b730c8f3d81ee898273f2df7837cc8 Author: MORITA Kazutaka Date: Mon Nov 5 18:26:46 2012 +0900 sheepdog 0.5.4 Signed-off-by: MORITA Kazutaka commit 6a15cbc1dc13bda5a6055205c01a5be47be4624b Author: MORITA Kazutaka Date: Tue Oct 30 09:36:20 2012 +0900 sheep: add support for using unix domain socket This patch adds support for a unix domain socket for a connection between qemu and local gateway. You can use this feature with the following syntax: $ qemu sheepdog:unix:: On my environment, the IOPS is improved from 3700/s to 4100/s against one local sheep with 4 KB buffer size. Signed-off-by: MORITA Kazutaka commit 68185f418901712fe7d2e96971a6ac810f3b8fbb Author: Liu Yuan Date: Mon Nov 5 16:14:38 2012 +0800 test: fix spurious failture of 001 002 Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 362aba337b5c47b58b94e63e8d8a148a95a7b0e2 Author: MORITA Kazutaka Date: Sun Oct 28 17:44:00 2012 +0900 vditest: add '-C' option to limit the maximum number of requests Currently, '-a' option generates too many I/O requests and causes ENFILE error in sheep. This adds '-C' option to limit the maximum number of concurrent I/O requests. This patch also removes all synchronous I/O operations to simplify codes. Signed-off-by: MORITA Kazutaka commit 4bb02bcb90f0adc6c3f3ffd2b322d34609e35409 Author: Liu Yuan Date: Sat Nov 3 23:09:45 2012 +0800 store/plain: move flag operation into get_open_flags() Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 1f7538104e59d24cabd1e8e2b9ccd58819433a5d Author: MORITA Kazutaka Date: Sun Oct 28 12:33:21 2012 +0900 sockfd_cache: guard fds_in_grow with atomic operations check_idx can be called by multiple threads at the same time. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 0d37b5c6394949c4b32618ace56a1c4598ccc8cb Author: MORITA Kazutaka Date: Sun Oct 28 12:16:30 2012 +0900 introduce uatomic_bool This is a wrapper for complex atomic_cmpxchg. 
Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit d60cc20cb1061a8720cda8a16762c970f6a23efa Author: MORITA Kazutaka Date: Sun Oct 28 02:19:18 2012 +0900 fix compile error message when __SIZEOF_POINTER__ is not defined Signed-off-by: MORITA Kazutaka commit f23aafd0a6e820cb8cbdfd23c916660f7f20a81d Author: MORITA Kazutaka Date: Sun Oct 28 02:16:20 2012 +0900 lib: add support for GNU Flymake Signed-off-by: MORITA Kazutaka commit 795889ed1c822b7b8f18c4131dd1f8506835a771 Author: MORITA Kazutaka Date: Fri Oct 26 12:03:09 2012 +0900 avoid strcpy and strncpy strcpy has a risk of buffer overflow, and strncpy may not set '\0' to the destination buffer. This patch introduces pstrcpy to copy strings safely. Signed-off-by: MORITA Kazutaka commit 10eaac80a3d7c5d48a87cb5300d4babb12b8a918 Author: MORITA Kazutaka Date: Fri Oct 26 09:54:46 2012 +0900 collie: send no data when preallocating data objects Sheep calls fallocate before creating objects, so we don't need to send actual data to be written when preallocating objects. Signed-off-by: MORITA Kazutaka commit ec58ad9164d791f36039b41c6a24ca330f72e0a9 Author: Hitoshi Mitake Date: Sun Oct 28 16:08:24 2012 +0900 sheep: set the length of response in local_get_store_list() The commit: 55a84a160dca3c8ed656e21d85fbe93c82e731a9 reduced the setting of default length of response, so the result of local_get_store_list() wasn't sent to collie. Current output of "collie cluster format -b asdf" is: using backend asdf store Format failed: Targeted backend store is not found Available stores: --------------------------------------- Correct output is: using backend asdf store Format failed: Targeted backend store is not found Available stores: plain farm --------------------------------------- This patch let local_get_store_list() set the length of response. 
Signed-off-by: Hitoshi Mitake Signed-off-by: MORITA Kazutaka commit 64c87bc642a953f5bd3d51bdffd45995e8a5ad07 Author: Hitoshi Mitake Date: Fri Oct 26 16:45:28 2012 +0900 collie: remove invalid fallthrough in option parser of vdi The fallthrough in vdi_parser() seems a bug. Signed-off-by: Hitoshi Mitake Signed-off-by: MORITA Kazutaka commit 8b75d5cc8014fd4befeea850c8cf3d3beafeddff Author: MORITA Kazutaka Date: Wed Oct 24 01:07:37 2012 +0900 sheep: avoid busy loop in do_client_tx When short write happens, we should wait for connection to be ready for write in epoll_wait rather than doing busy loop in the main thread. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 0479554d77a47df99bf2897b51675cacc46b4ef6 Author: MORITA Kazutaka Date: Mon Oct 22 02:25:53 2012 +0900 use xcalloc for xzalloc implementation Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 93bb90275813d741ffde8787ee8a122dfc2b2296 Author: MORITA Kazutaka Date: Sun Oct 21 04:25:39 2012 +0900 sheep: initialize only used write_info Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit d101a862f4a0f0d0524dbb9a0a0f6baba44fafbb Author: MORITA Kazutaka Date: Sun Oct 21 11:27:16 2012 +0900 logger: check sheep process after prctl is called The sheep process could exist before setting a signal handler, so we need to check the aliveness of the process after prctl call. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 1ecd7985977170f7a4b98e6c4ef1783eae26f282 Author: MORITA Kazutaka Date: Sun Oct 21 03:17:49 2012 +0900 logger: increase shared memory size from 1MB to 32 MB This reduces the risk of dropping log message by increasing the buffer size from 1 MB to 32 MB, which is the default SHMMAX on many OSes. 
Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit df0fa0d404bdee679db3c8895a22c9e21140d1bd Author: MORITA Kazutaka Date: Sun Oct 21 02:43:28 2012 +0900 logger: format output string before locking Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit f48247c585b49404f3662c828b60f560cab0dd0f Author: MORITA Kazutaka Date: Sun Oct 21 02:22:28 2012 +0900 logger: cleanup logarea and logmsg Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 8dcc954f9ddb959ec155380e490398bfbfd793a4 Author: MORITA Kazutaka Date: Sun Oct 21 01:59:24 2012 +0900 logger: format time in logger process Calling localtime_r is a bit expensive operation, so it's better to call it in the logger process rather than in eprintf/dprintf. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit b17caf8566ef1ac6e0cf6a907ca1e255dd353cfb Author: MORITA Kazutaka Date: Sat Oct 20 03:12:48 2012 +0900 logger: use malloced memory for logger buffer la->buff is not shared between sheep and logger processes, so there is no reason to use a shared memory for it. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit a9c42dd384db1831f10d58cd76a67a7c2a182891 Author: MORITA Kazutaka Date: Sat Oct 20 03:10:54 2012 +0900 logger: remove logdbg We are unlikely to use these debug info. Let's remove them to simplify code. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 07df5cf3c51d338068eeb9c022a0eaf5cff07ec2 Author: MORITA Kazutaka Date: Mon Oct 22 02:49:51 2012 +0900 vditest: limit the maximum number of outstanding aio When using '-a' option, vditest sends too many requests to sheep and causes a 'too many open files' error. This patch limits the number of aio requests as a workaround. 
Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 9efb325eea712f0cab4bc511f882ca343eb4ace3 Author: MORITA Kazutaka Date: Wed Oct 17 17:51:45 2012 +0900 vditest: add option to use a local file for testing This is useful to compare performances between sheepdog vdi and local file. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit e97ea84d6b6ce1e2fad4c21299647e02afbc5acd Author: MORITA Kazutaka Date: Wed Oct 10 11:16:19 2012 +0900 vditest: enlarge maximum buffer size from 1 MB to 64 MB Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 73ef465b4e4782dad591ee7bc3b9da0ee2269d4c Author: MORITA Kazutaka Date: Tue Oct 9 16:03:18 2012 +0900 vditest: add support '-h' option to show performance statistic Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 23322c5bd2f558a1ddc663dc823c319f316d5350 Author: MORITA Kazutaka Date: Tue Oct 9 14:26:19 2012 +0900 vditest: add '-T' option to specify runtime Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 7b29d8fc8babeee3f3ea6d1e105d4d41c5dfbb40 Author: MORITA Kazutaka Date: Tue Oct 9 13:38:14 2012 +0900 vditest: add '-r' and '-w' options to set read and write ratio easily This also makes read-only test default like disktest. 
Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 618790764b786fc4b8c86a1812d2a30ff3e8925c Author: MORITA Kazutaka Date: Tue Oct 9 13:05:43 2012 +0900 vditest: allow specifying vdi name before command line options Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit f815d5422cbbb0c0ef4e8232fd6487b2d9a5a49f Author: MORITA Kazutaka Date: Tue Oct 9 12:56:04 2012 +0900 vditest: add verbose mode Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit c2c840a4909174e9fe6ed196394a7fc178ae72a3 Author: MORITA Kazutaka Date: Tue Oct 9 12:05:52 2012 +0900 vditest: add -f option to specify flush interval Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 175f8fc8c093df6bac3e9507bb0acf999179e001 Author: MORITA Kazutaka Date: Tue Oct 9 12:03:03 2012 +0900 vditest: use qemu-io '-t' option to specify cache mode Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 67c23bce3a3e6b177c8c9f26dede86d0868828c4 Author: Hitoshi Mitake Date: Fri Oct 19 20:21:39 2012 +0900 collie: "node kill" needs node list before its execution "collie node kill" needs node list before its execution. It seems that the flag SUBCMD_FLAG_NEED_NODELIST is eliminated in the commit: 8555a67353e87b0292e90a5d59e3d60a26f6eace After this commit, collie node kill exits with an error like this: Invalid node id '0' Cc: levin li Cc: Liu Yuan Signed-off-by: Hitoshi Mitake Signed-off-by: MORITA Kazutaka commit c176cedcb59fb830b32a6d87565896850564c047 Author: MORITA Kazutaka Date: Thu Oct 18 17:21:19 2012 +0900 don't set hdr->epoch for non peer requests The field is used only for internal protocols. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 9779610cb656033da5b970c5dfed962e0638d051 Author: MORITA Kazutaka Date: Thu Oct 18 17:23:15 2012 +0900 sheep: fix recover complete notification hdr->epoch cannot be used to notify a recovered epoch because the field is updated in queue_request. This patch uses hdr->obj.tgt_epoch instead. 
Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit a52bd41cc4bdccde3edd6a4e844563479212524c Author: MORITA Kazutaka Date: Thu Oct 18 04:51:32 2012 +0900 add const where possible Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit d6bf5c4348e1f43161a17017444fbe75eb112a89 Author: MORITA Kazutaka Date: Thu Oct 18 04:00:28 2012 +0900 fix coding sytle errors detected by checkpatch.pl Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 18145b931638c6a836af327ba0500ef2ebf71a85 Author: MORITA Kazutaka Date: Thu Oct 18 03:48:10 2012 +0900 simplify exec_req We can know how many bytes should be read and written from sd_req header, so we don't need to pass the size with rlen and wlen. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit db9ab0331f95256048cec603d0ccd8f50ff9b62b Author: Liu Yuan Date: Wed Oct 17 16:44:57 2012 +0800 tests/011: fix random suprious fail Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit ec77520c1c9013af44fc065c1c76b2c09a9c374a Author: MORITA Kazutaka Date: Sat Oct 13 22:56:40 2012 +0900 sheep/plain_store: fix error path in default_create_and_write Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit e06eef244ce039c255db67c8783e44c19a146b9f Author: MORITA Kazutaka Date: Wed Oct 17 03:04:27 2012 +0900 tests: fix test results of 039 and 041 Signed-off-by: MORITA Kazutaka commit 8eed96fcdb55d3aa72649ce390a93839a1628186 Author: MORITA Kazutaka Date: Wed Oct 17 03:04:26 2012 +0900 allow creating snapshot when there is no current image This fixes vdi_rollback and vdi_restore. Signed-off-by: MORITA Kazutaka commit eb20d3fe006825e3bca1a3ca31d52ab0f87b3e52 Author: MORITA Kazutaka Date: Wed Oct 17 03:04:25 2012 +0900 use req->vdi.snapid to specify whether snapshot is created or not SD_OP_NEW_VDI operation uses req->vdi.snapid only to check whether sheep should create a snapshot or not, so setting req->vdi.snapid to snapshot id is wrong. This patch uses a boolean variable for it instead. 
Signed-off-by: MORITA Kazutaka commit 5d88f35a5977dd5f8ad0e77531a58818e389d38e Author: MORITA Kazutaka Date: Tue Oct 16 16:32:44 2012 +0900 sheep: return error when reading snapshot without snapid or tag Signed-off-by: MORITA Kazutaka commit e71d44a8c0a8a9d87deafb08ea65ea6177d74da0 Author: MORITA Kazutaka Date: Tue Oct 16 16:32:43 2012 +0900 tests: add test to check vdi deletion Signed-off-by: MORITA Kazutaka commit 4ee3d4b0dc203a86bb4b87aeddbc5983d7749377 Author: MORITA Kazutaka Date: Tue Oct 16 16:32:42 2012 +0900 tests: show description even when not running tests This is useful when we select tests to be run. Signed-off-by: MORITA Kazutaka commit 45284e5ed9b3cb0cf4e500f6834b2eec9074b298 Author: MORITA Kazutaka Date: Mon Oct 15 03:35:25 2012 +0900 sheepdog 0.5.3 Signed-off-by: MORITA Kazutaka commit e02df0689b3295e7d8d36b555aa236cae42b616d Author: MORITA Kazutaka Date: Fri Oct 12 04:30:01 2012 +0900 collie: exit with an error when there are too many arguments Signed-off-by: MORITA Kazutaka commit 5f026ac84c3e80114a23eb021b4f0f5c3aaa978a Author: MORITA Kazutaka Date: Wed Oct 10 12:42:09 2012 +0900 sheep: fix buffer overflow of gateway_forward_request sheep forwards requests to all nodes when flushing objects, so the maximum number of target nodes is not SD_MAX_COPIES but SD_MAX_NODES. 
Signed-off-by: MORITA Kazutaka commit e67dc77265de963d966a0fc627c58de851756714 Author: MORITA Kazutaka Date: Wed Oct 10 11:17:57 2012 +0900 sheep: enable disk cache when creating objects Signed-off-by: MORITA Kazutaka commit 1e4f53969d77d82912d55ce933efe8e5805cd05e Author: MORITA Kazutaka Date: Tue Oct 9 11:55:55 2012 +0900 tests: retry collie node list when sheep is not ready Signed-off-by: MORITA Kazutaka commit 0e38c37894fd646a73fae30a5105d3b920f8ade4 Author: MORITA Kazutaka Date: Tue Oct 9 01:29:04 2012 +0900 sheep: handle older version client correctly SD_PROTO_VER is a protocol version between sheep and client, so the check of SD_PROTO_VER_TRIM_ZERO_SECTORS must be in gateway_read_obj, not peer_read_obj. Signed-off-by: MORITA Kazutaka commit a84808eeea9b49344543fdfe51ee127d00829a11 Author: MORITA Kazutaka Date: Mon Oct 8 12:01:46 2012 +0900 sheepdog 0.5.2 Signed-off-by: MORITA Kazutaka commit 615aaaf9474cc6ddae44047914502dee062932e0 Author: MORITA Kazutaka Date: Sat Oct 6 23:40:31 2012 +0900 fix sparse errors Signed-off-by: MORITA Kazutaka commit 5a3168e4bc800a89a8ad838a58e5c3cc28f0482c Author: MORITA Kazutaka Date: Sat Oct 6 22:51:35 2012 +0900 add support for sparse check Signed-off-by: MORITA Kazutaka commit 2fdaf4e7a19016bc90fc5a680fbafeaa5cb6e51f Author: MORITA Kazutaka Date: Sat Oct 6 23:00:44 2012 +0900 use offsetof in stddef.h Current code doesn't use pre-defiend compiler macro for offsetof because __compiler_offsetof is not defined anywhere. This patch includes stddef.h, which defines offsetof with __builtin_offsetof, rather than redefining it. Signed-off-by: MORITA Kazutaka commit 690cf3b9622dc4ddff9be89a32dac6c5870ad021 Author: MORITA Kazutaka Date: Fri Aug 31 12:55:25 2012 +0900 use bool for boolean variables This improves code readability. 
Signed-off-by: MORITA Kazutaka commit bd21c597f63a0830164b01da487bd3078a86b1aa Author: MORITA Kazutaka Date: Sun Sep 30 02:27:17 2012 +0900 trim redundant zero bytes of network and disk I/O data This will save a lot of network and disk I/Os especially when recovering sparse objects. This updates the protocol version between sheep and other programs, but the older one is also supported. Signed-off-by: MORITA Kazutaka commit 05c96ada8c859c1207a795042ac2ea11efa2ac29 Author: MORITA Kazutaka Date: Sun Sep 30 18:01:40 2012 +0900 make sd_rsp 64 bit aligned Without this patch, the header size will end up being changed if we add a 64 bit field to sd_rsp->obj or sd_rsp->vdi. Signed-off-by: MORITA Kazutaka commit 06050d1fef2038504cb0047b12a69a55427691e6 Author: MORITA Kazutaka Date: Sun Sep 30 12:32:56 2012 +0900 sheep: use ftruncate instead of writing to the last sector This also moves err_to_sderr to outside prealloc because we cannot know the oid of the file inside prealloc. Signed-off-by: MORITA Kazutaka commit 8dd17a891ca067f4a91a705b7681c834a7a2811e Author: MORITA Kazutaka Date: Fri Oct 5 05:08:17 2012 +0900 migrate: create backup before upgrading store Upgrading cluster can break store layout if the upgrade process fails. To avoid the worst case, this patch creates backup files beforehand. Signed-off-by: MORITA Kazutaka commit 6a3c75728924b701946e24ec6eb0257d781f3087 Author: MORITA Kazutaka Date: Fri Oct 5 03:49:23 2012 +0900 migrate: update config file with a new format version Signed-off-by: MORITA Kazutaka commit 1fc84b2a833777bf55c92bab81d18c1a461732e3 Author: Hitoshi Mitake Date: Fri Oct 5 20:51:12 2012 +0900 sheep: new assert() for better error messages This patch removes direct usage of assert() in sheep. assert() in daemon process causes problem because assert() writes its error message to stderr. In addition, typical daemon processes including sheep close stderr and dup() other fd. Actually, logger process of sheep dup() /dev/null for stderr. 
From my experience, using assert() directly in daemon processes makes debugging hard. So this patch adds assert() as a wrapper of panic(). With this patch, assert() can find suitable output fd for error messages with vprintf() and the error messages can appear in a log file of sheep. Signed-off-by: Hitoshi Mitake Signed-off-by: MORITA Kazutaka commit 2bbe2734fb2ef3d0c544d21fab026e832d825553 Author: MORITA Kazutaka Date: Sun Sep 30 10:06:48 2012 +0900 sheep: write config data when config file is zero byte If sheep is killed just after config file is created but before no bytes is written, the size of the config file would be zero byte. In this case, we should recreate config file rather than exiting with a version error. Signed-off-by: MORITA Kazutaka commit 80ceb7f496a7be249fb0423d141b3245d7228e29 Author: MORITA Kazutaka Date: Tue Oct 2 09:14:42 2012 +0900 sheepdog 0.5.1 Signed-off-by: MORITA Kazutaka commit e689b67ff4e43bed2c3e402e6adb761d9630f179 Author: MORITA Kazutaka Date: Wed Sep 26 04:40:05 2012 +0900 sheep: add support for upgrading sheep store This adds a '-u' option to sheep usage. If we start a sheep daemon with the option, it will upgrade the underlying store layout if necessary. Signed-off-by: MORITA Kazutaka commit cbc69dcdac9cf43d2884fce390991e370e508d7c Author: MORITA Kazutaka Date: Wed Sep 26 02:58:51 2012 +0900 sheep: check sheepdog data layout version before starting up This avoids data store corruption when there is no compatibility between the running sheep and the existing data. The format version of 0.4.0 is 0, and one of 0.5.x is 1. Signed-off-by: MORITA Kazutaka commit f94db9251156a1d3db9df30fc43a383b8dbb08e9 Author: Bryan D. Payne Date: Tue Sep 25 09:25:44 2012 -0700 Add command line option for binding to a specified interface. Currently sheep will try to bind to all interfaces. This patch provides a command line option allowing you to specify the ip address of the interface where you could like sheep to bind. 
If you don't provide this option, sheep continues binding to all interfaces as usual. Signed-off-by: Bryan D. Payne Signed-off-by: MORITA Kazutaka commit d4f424367b50ba89339127a96b9a3f13e046031f Author: Hitoshi Mitake Date: Wed Sep 26 22:19:48 2012 +0900 sheep: fix a little bug of is_master() in corosync.c Current is_master() in sheep/cluster/corosync.c has a possibility of reading array with invalid index. This patch fixes the little bug. Signed-off-by: Hitoshi Mitake Signed-off-by: MORITA Kazutaka commit 3739eb21422a6ee5898444b97de3d1b32ab2de40 Author: Liu Yuan Date: Wed Sep 26 14:11:13 2012 +0800 sheep: refine the object cache size value range sys->object_cache_size is uint32, we shouldn't use -1 to indicate invalid value. Instead, we allow 1 to UINT32_MAX inclusive by using 0 to indicate invalid value Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 6c8f6e96ee90cec3b3aa23c6e0ae50227eecee6b Author: Liu Yuan Date: Tue Sep 25 11:29:55 2012 +0800 sheep: fix object cache size setting Use megabytes as unit and allow users to specify size ranging from 1 ~ 4294967295 megabytes for object cache. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit a119190896a08451636f0cb324f192810176644d Author: MORITA Kazutaka Date: Fri Sep 21 23:44:04 2012 +0900 sheepdog 0.5.0 Signed-off-by: MORITA Kazutaka commit 879cfb18c8d8ba8ccd38554c0ecbdf172c0197d3 Author: MORITA Kazutaka Date: Fri Sep 21 04:30:34 2012 +0900 sheep/plain: set offset correctly in create_and_write Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 239cf324083153003f3420407dffa5a153443c95 Author: MORITA Kazutaka Date: Fri Sep 21 04:30:05 2012 +0900 tests: add test to check partial read and write Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 774bcbdcab53dbdf36d9f85a17d980e0cc8ea3b2 Author: MORITA Kazutaka Date: Thu Sep 20 20:47:26 2012 +0900 introduce SD_FLAG_CMD_DIRECT to bypass object cache Currently, collie commands also use object cache if it is enabled. 
This causes problems because sheep doesn't assume that other nodes also creates cache data of the same opened VDIs. If we allow it, data inconsistency problems happen when another node updates the cached objects. This patch forces collie commands to bypass object cache to avoid the problem, but vdi read and write still use object cache because they are used to emulate VM I/Os. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit c6aadcbfb8fd241df1e1fd73a792a5adee3ad8fe Author: Liu Yuan Date: Wed Sep 19 13:47:14 2012 +0800 tests: add a test to test vdi snapshot functionality concurrently This patch will fail against current master branch because of object cache regression, which is expected to be fixed by later patch. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 6dfd0c3ec7803952e6f6731a084af2b664c602ff Author: MORITA Kazutaka Date: Tue Sep 18 21:42:56 2012 +0900 tests: omit mkfs outputs Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 9701af44493e760994c68bfb0dfbafc1748c1fd3 Author: MORITA Kazutaka Date: Tue Sep 18 21:36:41 2012 +0900 tests: filter repeated lines to avoid double umount call Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit d9de57b8bfa148757c1205677c2e903bdf125469 Author: MORITA Kazutaka Date: Tue Sep 18 21:04:59 2012 +0900 tests: append log outputs instead of overwriting Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit c4c2940e876fd9b899f9960e185808d776a01583 Author: MORITA Kazutaka Date: Tue Sep 18 20:13:17 2012 +0900 lib/event: initialize variable This fixes a valgrind error on debian wheezy i686. 
Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit e41d7a120d07bfb5beb859395568c9b7dfdfbda0 Author: MORITA Kazutaka Date: Tue Sep 18 19:54:48 2012 +0900 sheep: fix printf format errors on 32 bit machines Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 528d674a0f76381b52e5d6bc2b3932783fa6d075 Author: MORITA Kazutaka Date: Tue Sep 18 21:35:48 2012 +0900 sheep: avoid atomic operations against uint64_t/int64_t variables On 32 bit architectures, atomic operations against 64 bit integers are not allowed. This patch uses more proper types for them. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 24c742bff0de81cc643eb9a6831394b8bb4b160c Author: Liu Yuan Date: Tue Sep 18 21:02:26 2012 +0800 tests/010: fix spurious failure on small disks Let all sheep start with the same disk size. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 05d359210e9376821d53ae446e3f959fa7d57b7f Author: MORITA Kazutaka Date: Tue Sep 18 17:49:55 2012 +0900 sheep: initialize buffer in recover_object_from_replica This is necessary to call free safely when the target object is in the local node. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 13341522bbdfac8c44aec8bc489906b11de71dae Author: MORITA Kazutaka Date: Tue Sep 18 17:43:49 2012 +0900 cluster/local: handle leave events A cluster driver needs to call event handlers even after leave_cluster is called. This patch fixes it. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit bc9c5b846f4a1a1fc5789e48f158863586458de2 Author: MORITA Kazutaka Date: Tue Sep 18 17:43:48 2012 +0900 tests: add test to check disk error Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit acef334850cf7737a5a66687dc3837c8635a1af3 Author: Liu Yuan Date: Mon Sep 17 19:08:22 2012 +0800 plain: fix prealloc() errno This will fix the spurious failure of tests/042. 
Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 566383c5a8ce11e748e9975cb6f12d6230ced9c4 Author: Liu Yuan Date: Mon Sep 17 18:38:55 2012 +0800 sheep: don't exit at EIO We don't work with local driver yet because it doesn't handle LEAVE correctly, but I have tested with corosync, it works: the failed node can act as a gateway node seamlessly. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit ab1d7a7e17fcf6ed0770d9b7c7b18d57d4f99bae Author: Liu Yuan Date: Mon Sep 17 10:42:24 2012 +0800 recovery: reorder recover_object_from_replica() Move valloc() after store->link() operation will save us one glibc call for recovering local objects. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit a2288a6296ea7034824738ab17e0ad905bc6a6c3 Author: MORITA Kazutaka Date: Mon Sep 17 08:00:02 2012 +0900 update bash_completion file Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 48b8b0202294c03f38949895c78ae37501f54f52 Author: MORITA Kazutaka Date: Mon Sep 17 08:00:01 2012 +0900 store/plain: unlink temporary files when create_and_write fails Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 5ddb47152c51c48a5d00d0c5e5eacb630166cd5d Author: MORITA Kazutaka Date: Mon Sep 17 08:00:00 2012 +0900 sheep: handle no space error of object I/Os This handles no space error of object I/Os. Handling this error when writing a config or epoch logs is left for future work. 
Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 0a56f759704b745ef079d35bc7bd44e318793656 Author: MORITA Kazutaka Date: Mon Sep 17 07:59:59 2012 +0900 tests: add test of disk full handling Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit fb31eb0f37c368b0c2baf80ec720d9a2b0d68b6b Author: MORITA Kazutaka Date: Mon Sep 17 07:59:58 2012 +0900 tests: introduce helpers to create loopback devices This prepares for the next patches Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit ddd4e3dcf635260fd2b0ee1f00c345baa538ed35 Author: MORITA Kazutaka Date: Mon Sep 17 07:59:57 2012 +0900 sheep: exit program when EIO occurs When EIO occurs, a sheep is expected to become a gateway node, but it doesn't work now. To fix this, we need to support changing virtual nodes without stopping sheeps. This patch makes it a future work, and stops the program simply on the EIO error. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit f7aa988132b8248d600552ecc714870efaf0505b Author: MORITA Kazutaka Date: Mon Sep 17 07:59:56 2012 +0900 collie: use stdout and stdin for vdi backup and restore This helps us to use backup/restore via ssh or pipe. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit d0b10fb238d239799504ce7a8f7ea218b74abd1f Author: MORITA Kazutaka Date: Mon Sep 17 07:59:55 2012 +0900 sheep: fix vnode_info leak Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit dbda980ad98a78050fc44ea34a473f77f5a11f0a Author: MORITA Kazutaka Date: Mon Sep 17 07:59:54 2012 +0900 sheep: fix double put_vnode_info call If cluster status is not SD_STATUS_OK after the request is retried, queue_request doesn't set req->vinfo and free_request calls put_vnode_info again. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 31bb9134e30a661155a7f7f4811b052b0c0ded3e Author: MORITA Kazutaka Date: Fri Sep 14 16:57:49 2012 +0900 store/plain: use link to recover local objects It is not safe to rename stale objects to the working directory. 
When reading objects for recovery, default_read reads the working directory first, and then, searches stale objects. However, if the objects are renamed from the stale directory to the working directory on the node at the same time, the recovery read request fails to find the object. To fix this race condition, this patch uses link/2 to keep stale objects. This fixes test failure of tests/028. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 5b21c11fb9aa7c859ed5b19d18e00fd26fbfe717 Author: MORITA Kazutaka Date: Tue Sep 11 00:19:07 2012 +0900 store/plain_store: use epoch numbers as decimals This makes stale object more human readable because we use decimal numbers to print epochs in other places. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit ccfc305261f1b55f093bff6546240164ab2520a1 Author: MORITA Kazutaka Date: Sat Sep 8 08:40:33 2012 +0900 sheep: remove redundant ternary operator Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit a5773b5acdc7a08903a0739f787d6d477d8a711f Author: MORITA Kazutaka Date: Fri Sep 14 16:10:13 2012 +0900 tests/028: show md5 hash values The VDI contents of this test are not random, so it's better to show their md5 hash values to know which VDIs are corrupted. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 349178182123b601dc0efe873ffd28c4465fb075 Author: MORITA Kazutaka Date: Fri Sep 14 02:33:38 2012 +0900 tests/037: avoid false detection of network partion We shouldn't stop more than half of nodes at the same time. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit a598e93b9bce250301a1bb85f2b3b9a458c5ce0c Author: MORITA Kazutaka Date: Thu Sep 13 18:19:11 2012 +0900 sheep: open files with O_EXCL when creating objects There is a race condition in default_create_and_write_obj because if node membership changes during object creation, gateway and recovery process could send CREATE operations to the same object at the same time. This causes, for example, the following problem. 1. 
gateway request creates a tmp_path file 2. recovery process also creates the same tmp_path file 3. gateway request renames the tmp_path to a correct path 4. recovery process also tries to rename, but it fails because the tmp_path doesn't exist To avoid this problem, this patch uses a O_EXCL option to create objects. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit a313bd0b07a81b77963c05ccde695a907a7cb50d Author: MORITA Kazutaka Date: Wed Sep 12 12:28:04 2012 +0900 sheep: make create operations atomic Currently, CREATE_AND_WRITE operations are processed even if recovery works of their target objects are on going (commit b3442e66). This causes a problem in the following situation (and it actually happens with test 008): 1. There are three nodes, A, B, and C. 2. CREATE_AND_WRITE operations are sent to them, and at the same time, a node D is added to cluster. 3. The node D recovers an object from node A, but the object is just being created. The node D reads a zero filled object; it is fallocated, but data is not written yet. 4. The CREATE_AND_WRITE operation is retried to node D, and it is successfully finished. 5. The recovery process on node D overwrites the object with the zero filled data. Instead of adding another complex check in request_in_recovery, this patch makes all CREATE_AND_WRITE operations atomic to keep code simplicity. With this patch, store->write is split into store->write and store->create_and_write, and store->atomic_put is removed. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 5891a9e896d179196e32bf0fd9a5da7458498952 Author: Liu Yuan Date: Thu Sep 13 11:39:53 2012 +0800 collie: update help message for collie cluster recover Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 63e44d2f0bded14edb83095f3549077bb6f1d114 Author: MORITA Kazutaka Date: Tue Sep 11 15:06:53 2012 +0900 tests: use wait instead of _wait_for_collie When we use -valgrind option, there is a delay before collie starts up. 
In the worst case, we could call _wait_for_collie before collie starts. To solve the problem, this simply waits for background processes to finish rather than iterating pgrep and sleep. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 98ff23debc4bc79d371923c2b31bc6fc8e312ef7 Author: MORITA Kazutaka Date: Tue Sep 11 14:56:42 2012 +0900 tests: sleep 1 second before unmount Seems that umount just after stopping sheeps fails occasionally. Let's add 1 second sleep before umount again and see how it works. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 1a5ad5ce5b1cdbedf6760cc41c5bafd2e558bd6f Author: Liu Yuan Date: Wed Sep 12 17:13:29 2012 +0800 gateway: clean up gateway_read_obj() - use gateway_init_fwd_hdr() to init forward header. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 51fb3c963f77b82517c5b631f5ba9b31a4ead956 Author: Liu Yuan Date: Wed Sep 12 16:07:13 2012 +0800 sheep: add a helper to send req on top of sockfd cache This will greatly reduce the duplicated and lengthy code. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 74af57f2c5975e2f91168360a83bb96fb26fdf86 Author: Liu Yuan Date: Wed Sep 12 16:07:12 2012 +0800 sheep: use cached sockfd for all connections This will especially boost recovery performance a bit. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit cc540fc2bf29c88340ee24ce3b03e6d00011c326 Author: MORITA Kazutaka Date: Wed Sep 12 16:05:07 2012 +0900 tests: rewrite 010 to test new manual recovery Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit a59a37e9405721f90353542cf12ef1e4b7aa31e3 Author: MORITA Kazutaka Date: Wed Sep 12 18:51:42 2012 +0900 sheep: add support for manual recovery This implements disabling object recovery. When recovery is disabled, sheep will suspend the recovery process after it recovers objects in the prio_oids queue. The suspended recovery is resumed after new objects are pushed into the prio_oids queue. 
This means that unaccessed objects are not recovered at all. Note that sheep increments epoch even when recovery is disabled. If sheep receives a write request, it will place the updated object based on the current node membership. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 46a03e63a8f566015a14978bd0d2b7025500dc17 Author: MORITA Kazutaka Date: Wed Sep 12 16:05:05 2012 +0900 Revert "sheep: do the real work of dsiable/enable recovery" It requires quite big changes to extend this patch to handle leave nodes. The next patch will implement manual recovery with much simpler approach. This reverts commit a9186fd8f0e133ba7065a579da1062650991e89a. Conflicts: sheep/group.c sheep/sheep_priv.h Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 67a24a7d1d3724ecefdc1893b7ebb748bc04f6e4 Author: MORITA Kazutaka Date: Wed Sep 12 16:05:04 2012 +0900 Revert "collie: add cluster recover info command" With the new manual recovery design, we need to increment epoch even when object recovery is disabled, so the command "cluster recover info" is no longer required. This reverts commit d652f24f00bef5045233083ff704b252a54beffe. Conflicts: include/internal_proto.h sheep/ops.c Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 07455d3bb420dae24e8bdc6d2f993f67302f3ea5 Author: Liu Yuan Date: Tue Sep 11 16:17:11 2012 +0800 sheep: correct join_message->nr_nodes type In send_join_request() we use -1 to check if the epoch read error happens. So we should change uint16_t to int16_t. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 012530dd8f56f0688996f9bbdc7da418385f37e7 Author: Liu Yuan Date: Tue Sep 11 16:17:10 2012 +0800 collie: fix buffer overflow of size_to_str() We might end up with i == ARRAY_SIZE(units), which will cross array boundary. 
Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 1d468715f78af61c87c7e39bf2141de17cd0f540 Author: Liu Yuan Date: Tue Sep 11 14:54:02 2012 +0800 ops: refactor read_copy_from_replica() This function does completely the same as gateway_read_obj(), so we'd better directly call gateway_read_obj() by faking a request. We didn't call it in the old code because of a nasty nested requests problem over the same FD. Now with short thread and sockfd, we don't have this kind of problem. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 8d41ed11039120374c8397c55b73a9b45b070135 Author: Liu Yuan Date: Tue Sep 11 11:45:05 2012 +0800 gateway: clean up wait_forward_request() We only process POLLIN events, then we can simplify processing with goto and remove the never-reach code. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 55a84a160dca3c8ed656e21d85fbe93c82e731a9 Author: MORITA Kazutaka Date: Mon Sep 10 22:18:25 2012 +0900 sheep: set response data size to zero by default We tend to forget to set rsp->data_length to 0 when we don't need to send any data, so it looks better not to initialize the value with req->data_length. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit c3f59890bffc643a508df357859c56dcb262fa7f Author: MORITA Kazutaka Date: Mon Sep 10 22:12:38 2012 +0900 tests: add option to exit immediately on test failure This adds an '-e' option to stop check script immediately on test failure. This is useful when we want to check sheep.log or dumped core files of the failure. 
Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit c82ac95da6af26f8ce6682eef5554a2eb47bacef Author: MORITA Kazutaka Date: Sun Sep 9 01:14:32 2012 +0900 collie: handle snapshot name correctly in fix get_oid Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 139333373782e85272f80ca4226192294e68f3f4 Author: MORITA Kazutaka Date: Sun Sep 9 12:45:40 2012 +0900 sheep: remove temporary objects only when sheep starts up It is not safe to remove temporary objects while sheep is running. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 472fd228899bbca4f07a311401bf2d8239f5800f Author: MORITA Kazutaka Date: Sun Sep 9 10:04:35 2012 +0900 collie: don't show cluster creation time when collie fails to get it Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 4f63b77691f4859652cbdf6e5b5a8b2fac06b0f5 Author: Liu Yuan Date: Mon Sep 10 15:21:40 2012 +0800 test: fix simulate_machine_down helper use $1 instead of mad $i Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit beec87cc3cd9e90d5540c2457c9c2bec4f618a22 Author: MORITA Kazutaka Date: Mon Sep 10 01:41:34 2012 +0900 tests: add test to check vdi backup and restore Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit ca44e2d56dc6f313a834e89ac157bdfc81c16d03 Author: MORITA Kazutaka Date: Mon Sep 10 01:41:33 2012 +0900 collie: vdi restore support This enables us to restore vdi images with backup files created by 'vdi backup' command. $ collie vdi restore Usage: collie vdi restore [-s snapshot] [-a address] [-p port] [-h] Options: -s, --snapshot specify a snapshot id or tag name -a, --address specify the daemon address (default: localhost) -p, --port specify the daemon port -h, --help display this help and exit Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 62f5ace89c991e5ba90781d3f39d8f414b1d2922 Author: MORITA Kazutaka Date: Mon Sep 10 01:41:32 2012 +0900 collie: add vdi backup support This enables us to create an incremental backup between two snapshots. 
$ collie vdi backup Usage: collie vdi backup [-s snapshot] [-F from] [-a address] [-p port] [-h] Options: -s, --snapshot specify a snapshot id or tag name -F, --from create a differential backup from the snapshot -a, --address specify the daemon address (default: localhost) -p, --port specify the daemon port -h, --help display this help and exit The created backup file contains differential data for each data objects. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 310b1e261b29e5d28d840c41030a3132bead9d9a Author: MORITA Kazutaka Date: Mon Sep 10 01:41:31 2012 +0900 collie: add helper function to read vdi objects Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit e115a18420028d4bfeeaed40b9019232253f232e Author: MORITA Kazutaka Date: Mon Sep 10 02:06:28 2012 +0900 tests: avoid false detection of network partion If more than half nodes go down at the same time, sheep wrongly detects a network partition failure. This add one second sleep to avoid the problem. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 15df161958a38cf3f7bc83b5bc2c8a1817b3072e Author: MORITA Kazutaka Date: Mon Sep 10 02:06:27 2012 +0900 corosync: handle network partition This patch adds support for network partition detection again with more efficient way. Even when many nodes go down at the same time, corosync dispatches leave messages one by one. It makes it difficult to detect a network partition. To determine whether there are succeeding leave events or not, this patch uses a cpg file descriptor, which is used to check notified corosync messages. If the descriptor is ready for read, sheep skip processing events and tries to receive the next events. If more than half numbers of nodes go down at the same time, sheep determine that a network partition has occurred and stop serving. With this patch, sheep can also detect NIC failure and avoid updating epoch wrongly. 
Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 0f19763622d54192f252c99bdeae4b3aa9a145d5 Author: MORITA Kazutaka Date: Mon Sep 10 02:15:47 2012 +0900 Revert "sheep: let all sheeps with smaller epoch added into delayed_nodes list" This reverts commit 98241c50d23d99047c37a606a845f5f0bccbbf50. We cannot pass test 002 after the commit and it looks difficult to fix the problem. The benefit of the commit is not mandatory, so let's revert it for now and fix it in future. Signed-off-by: Liu Yuan commit 43ffcd8ab63338be523571c2fb732090292ed092 Author: MORITA Kazutaka Date: Mon Sep 10 02:15:46 2012 +0900 tests/002: add correct output file Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 41aa90fcde74cf2df1209f00cd996c8edfb73f76 Author: MORITA Kazutaka Date: Sat Sep 8 11:58:20 2012 +0900 local: update queue in correct order This updates the current queue position after writing queue data. Otherwise, if a sheep process exists during pushing data into the queue, other sheeps can read invalid data. Signed-off-by: MORITA Kazutaka commit 1b03f40c5f13e974aa02a06207f11b5aee3293e5 Author: MORITA Kazutaka Date: Fri Sep 7 17:02:42 2012 +0900 sheep: disable journal by default It seems that most modern disk drives implement atomic sector writes (e.g. section 7.4 in http://www.sqlite.org/atomiccommit.html). If it is true, we don't need to use journal for vdi object updates because we don't update them across a sector boundary. This patch uses atomic_put() for vdi creation, and doesn't use journal for vdi updates by default. Signed-off-by: MORITA Kazutaka commit 36fe1677b6c32d066aecb61eb3780ccfcb4ac3ab Author: MORITA Kazutaka Date: Fri Sep 7 13:03:29 2012 +0900 sheep: fix overflow when reading journal data When writing a whole vdi object, the written data size will be more than (1 << 22). This patch allocates the actual data size to read buffer. 
Signed-off-by: MORITA Kazutaka commit d8096ac3f460a14a3bb21fe641c148cce22d70d0 Author: MORITA Kazutaka Date: Fri Sep 7 00:42:10 2012 +0900 sheep: fill zero to avoid reading uninitialized data This might be unnecessary, but It is a bit weird to calculate sha1 hashes based on uninitialized data. This also silences valgrind errors. Signed-off-by: MORITA Kazutaka commit 3fe5acb61ae7868278b7809b5626f9c4b3123f81 Author: MORITA Kazutaka Date: Wed Sep 5 23:50:34 2012 +0900 tests: abort test on valgrind error This dumps a core file on a valgrind error to make fixing it much easier. It seems that valgrind complains about some corosync versions. Feel free to add a new section to valgrind.supp to suppress meaningless errors. Signed-off-by: MORITA Kazutaka commit 02d07c4cd926634dcc089b8a27cb1242d6e894c9 Author: MORITA Kazutaka Date: Thu Sep 6 21:06:55 2012 +0900 tests/011: specify file system type This also adds a '-J' option to use small journal size. Otherwise, the virtual nodes calculation becomes a different result from other file system. Signed-off-by: MORITA Kazutaka commit 48c8f0e6ff098c1c6c4bdf6f91c507c144af685c Author: Hitoshi Mitake Date: Fri Sep 7 19:17:03 2012 +0900 sheep: include config.h in plain_store.c for HAVE_SYNCFS config.h is required for determine HAVE_SYNCFS is defined or not. 
Signed-off-by: Hitoshi Mitake Signed-off-by: MORITA Kazutaka commit 86c07ccf1b4a6434346f9ecc8d77f4b7d72c68e4 Author: Hitoshi Mitake Date: Fri Sep 7 17:55:02 2012 +0900 sheep: use node_is_local() instead of is_myself() node_is_local() is added to sheep_priv.h, using it instead of is_myself() is a little bit better Signed-off-by: Hitoshi Mitake Signed-off-by: MORITA Kazutaka commit 0724b59e5dc5d8445607b066fdee8fb64f4b3dae Author: Hitoshi Mitake Date: Fri Sep 7 17:55:01 2012 +0900 trivial: ignore *.patch and tests/*.out.bad in .gitignore Signed-off-by: Hitoshi Mitake Signed-off-by: MORITA Kazutaka commit 9a83f20f58d4f40bce5e8a4af1a2bc7ef8bd897d Author: MORITA Kazutaka Date: Thu Sep 6 01:30:43 2012 +0900 collie: fix uninitialized values Signed-off-by: MORITA Kazutaka commit d90a807da611ea397162548dbf37a5293c2ecaa8 Author: MORITA Kazutaka Date: Wed Sep 5 18:48:11 2012 +0900 logger: avoid calling strcpy against overlapping string dirname may return a pointer to its argument, so it is safe to use a temporary buffer for strcpy. Signed-off-by: MORITA Kazutaka commit 8d8d3f03f0a37b169a998aec5f263c2a47c0e114 Author: MORITA Kazutaka Date: Wed Sep 5 18:44:13 2012 +0900 sheep: don't send data when read request fails We don't need to send useless data to client when a read request fails. Signed-off-by: MORITA Kazutaka commit f9c548e868e809bb8f6a97161122ba609f7753d5 Author: MORITA Kazutaka Date: Wed Sep 5 17:53:57 2012 +0900 farm: don't call close when we fail to open file Signed-off-by: MORITA Kazutaka commit 0c64f71defe1900a9fc5fa5c14bee0c5866020c2 Author: MORITA Kazutaka Date: Wed Sep 5 21:56:48 2012 +0900 tests: add valgrind support to collie command This patch shows valgrind error messages of collie if they exist. 
Signed-off-by: MORITA Kazutaka commit 8737cc7d3005337e3b9f0e1644e4877aa081b356 Author: Hitoshi Mitake Date: Fri Sep 7 14:25:54 2012 +0900 tests: modify usage of -w in 018, 019, 020 for new cache option -w is obsolete, so this patch modifies -w usage in 018, 019, 020. Cc: MORITA Kazutaka Cc: Liu Yuan Signed-off-by: Hitoshi Mitake Signed-off-by: MORITA Kazutaka commit fca13d54f1bc880f83aec7d7d7b2758a7ce4a33d Author: Hitoshi Mitake Date: Fri Sep 7 14:25:53 2012 +0900 sheep: add SD_OP_FLUSH_NODES and SD_OP_FLUSH_PEER for writeback cache semantics This patch adds two new internal sheep operations: SD_OP_FLUSH_NODES and SD_OP_FLUSH_PEER for implementing writeback cache semantics in backend stores. If writeback cache semantics is used in backend stores, explicit flushing in all sheeps is required when gateway sheep receives SD_OP_FLUSH_VDI. After applying this patch, SD_OP_FLUSH_NODES will be queued as a gateway request when sheep receives SD_OP_FLUSH_VDI. SD_OP_FLUSH_NODES forwards SD_OP_FLUSH_PEER to all other sheeps. After receiving the SD_OP_FLUSH_PEER, sheeps flush their cache of backend stores. This patch also modifies command line option of sheep. -w was used for enabling object cache and specifying size of it. After applying this patch, -w is also used for enabling writeback cache semantics in backend stores. Example of new -w is like this: -w disk ... enable writeback cache semantics of disks -w disk,object:size=50 ... enable writeback cache semantics of disks, and enable object cache with 50MB memory -w object:size=50 ... enable object cache with 50MB memory -w object:size=50:directio ... 
enable object cache with 50MB memory with O_DIRECT Cc: MORITA Kazutaka Cc: Liu Yuan Signed-off-by: Hitoshi Mitake Signed-off-by: MORITA Kazutaka commit 71773b235fdafb2652e0f3ad65205128cc320a0e Author: Hitoshi Mitake Date: Fri Sep 7 14:25:52 2012 +0900 sheep: add new operation flush() to store_driver This patch adds new operation flush() to store_driver for flushing underlying cache of storage. flush() is required for enabling disk cache in sheep. This patch also adds default_flush() for farm and plain_store. default_flush() is based on syncfs() (if it is available) or sync(). Cc: MORITA Kazutaka Cc: Liu Yuan Signed-off-by: Hitoshi Mitake Signed-off-by: MORITA Kazutaka commit 7c7033c43dda856f15fdc2e82f4bcae14c27fbdc Author: Hitoshi Mitake Date: Fri Sep 7 14:25:51 2012 +0900 sheep: change gateway_forward_request() for forwarding requests to all other nodes This patch adds a new boolean parameter all_node to gateway_forward_request(). When this parameter is true, gateway_forward_request() forwards a request to all other nodes instead of vnodes which store the object. 
Cc: MORITA Kazutaka Cc: Liu Yuan Signed-off-by: Hitoshi Mitake Signed-off-by: MORITA Kazutaka commit f172a2cb48347f52c74bb5704657d917d7aa0fcc Author: levin li Date: Wed Sep 5 16:41:59 2012 +0800 object cache: try to delete cache entris in every node when deleting a VDI Every node that starts VM in writeback mode can create cache entries, so we can not only remove cache entries in the node that executes the VDI deletion work, but should notify to try to delete cache entries on every node Signed-off-by: levin li Signed-off-by: MORITA Kazutaka commit a0cd37999758318ea38fb36e1e9ba8a0fd1e970f Author: levin li Date: Wed Sep 5 19:33:56 2012 +0800 sheep: read inode directly from backend in VDI deletion work This patch is preparation for next patch that deletes object cache when deleting VDI, if object cache deletion work is processed in notify, then it may causes race between cache deletion and next VDI deletion work, because VDI deletion work may needs to read its parent VDI inode data, so we can not read the inode data from cache Signed-off-by: levin li Signed-off-by: MORITA Kazutaka commit 17d104c3a39f7475ae57abb86ad16792378d0f5b Author: MORITA Kazutaka Date: Wed Sep 5 18:37:46 2012 +0900 tests: ensure sheep starts up when using valgrind When using valgrind, there is a delay before sheep starts up. This checks a lock file to ensure that sheep is running. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 6393e8307e1197d77c3076e083b62efd02e1e68a Author: MORITA Kazutaka Date: Wed Sep 5 17:42:11 2012 +0900 sheep: avoid using large stack area This also silences valgrind warnings. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 69f41ed07c8dc20c572d85c612dd0789404b3d51 Author: MORITA Kazutaka Date: Wed Sep 5 13:15:57 2012 +0900 tests: add valgrind support If you add '-valgrind' to command options, check script wraps all sheep calls with valgrind. This makes it easier to debug problems and detect memory leaks. 
Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 38dafedb5f8a2b07d0926514354cacb3142435e3 Author: MORITA Kazutaka Date: Wed Sep 5 16:53:33 2012 +0900 tests/040: wait for sheeps to finish starting up before listing vdis Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 2ce9ecd8346cc93562e7e38103348e907d0f0808 Author: MORITA Kazutaka Date: Wed Sep 5 15:07:33 2012 +0900 tests: ensure that sheeps are not running if necessary This ensures that sheeps are not running when - starting a testcase - unmounting devices - starting a sheep daemon This removes a lot of false test errors. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 385c11727cc9a73776203c87289cdf38e2a3dca6 Author: MORITA Kazutaka Date: Tue Sep 4 08:54:14 2012 +0900 local: add more debug outputs Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 7e14b0b5128503b109f3880b9186b6c3a4835114 Author: MORITA Kazutaka Date: Sun Sep 2 15:56:33 2012 +0900 local: remove blocking event when its sender goes away This avoids a dead lock. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 1cec5523ae0f1ab44b0d82d17555aae00629679e Author: MORITA Kazutaka Date: Tue Sep 4 15:34:03 2012 +0900 local: don't process events until joining to cluster This patch also removes the master node from the local driver. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 886da4d1cb030dc52933d9d0a96270c49a9dd37b Author: MORITA Kazutaka Date: Tue Sep 4 15:32:18 2012 +0900 local: avoid calling sd_block_handler twice Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 6fa2b356251ccbd8576ce09d15a903d51181faee Author: MORITA Kazutaka Date: Sun Sep 2 15:46:58 2012 +0900 local: use a different queue for block operations This is the same change which was done in corosync driver to avoid deadlock of vdi creations. 
Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 9e22557351931b06b43fc06facf3bbe1a73077cd Author: MORITA Kazutaka Date: Mon Sep 3 19:23:45 2012 +0900 local: check process at shorter intervals Some testcases regards that sheep can detect node failure in less than 1 second. This patch changes the interval of process check from 1 second to 200 milliseconds. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 5f910a8e46458f88586a96d0bb615419ec1e04b0 Author: MORITA Kazutaka Date: Tue Sep 4 10:25:58 2012 +0900 local: remove checksum Currently, local driver doesn't update both queue and checksum data atomically, so if the process exits unexpectedly after updating queue and before doing checksum, a newly started process can regard the queue as invalid. This patch removes checksum feature completely from local driver. Though we should introduce something like journaling to update queue safely, it looks overkill because local driver is only for testing purpose. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 9f29e1eab7ed16724c886785ecb0a33ea9557b53 Author: levin li Date: Wed Sep 5 11:13:33 2012 +0800 object cache: fix crash when deleting object cache Signed-off-by: levin li Signed-off-by: Liu Yuan commit 79ef64dc72559b349966f11a8c257ed5a2251084 Author: Liu Yuan Date: Wed Sep 5 12:30:48 2012 +0800 test: fix random spurious failure by using _wait_for_sheep helper Signed-off-by: Liu Yuan commit 8486f40314695dff03588531dedad01c8dadcc09 Author: MORITA Kazutaka Date: Wed Sep 5 08:28:25 2012 +0900 sheep: remove unused variable in cluster_make_fs Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 7d13516fb9dccefd04bc583bd323b6a35852632d Author: MORITA Kazutaka Date: Tue Sep 4 17:28:26 2012 +0900 corosync: give up cpg_initialize after retrying CPG_INIT_RETRY_CNT times It seems that, with corosync 1.x.x, cpg_initialize returns CS_ERR_TRY_AGAIN when the corosync daemon is not running. 
This patch avoids an infinite loop to limit the retry count. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit ff74357b8ec15f9a1ca758afac7780ee849000e6 Author: MORITA Kazutaka Date: Sun Sep 2 13:01:38 2012 +0900 collie: fix uninitialized variables Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 576f4dc8257cd1291d0f42f3ea59d959a561ad4e Author: MORITA Kazutaka Date: Tue Sep 4 12:16:36 2012 +0900 sheep: check return value of get_vdis_from A newly added node sends SD_GET_VDI_COPIES to only one node, so if the node is not available, the added node cannot get vdi copies info. We should try another node in such case. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 5d807f07c3e7912c4f269c0c7341f801138a5e47 Author: MORITA Kazutaka Date: Mon Sep 3 01:57:11 2012 +0900 store: return SD_RES_NO_OBJ when object size is zero If clients read objects which is just being created, the size of read data can be zero and clients get a SD_RES_EIO error. Sheep leaves cluster when the SD_RES_EIO error happens, so we should return SD_RES_NO_OBJ in this case. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 97ccd87ea15e606b6ec9fecb54f5de453f9c5c1f Author: MORITA Kazutaka Date: Tue Sep 4 10:53:13 2012 +0900 sheep: wait for vdi copies to be updated in recovery process If node membership changes between process_work and process_main of SD_OP_NEW_VDI, we cannot get the vdi copies in recovery process. In that case, we need to wait for post_cluster_new_vdi to finish. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit e3246821de6477ef2a2efce261e200f6d6e9628b Author: MORITA Kazutaka Date: Sun Sep 2 03:25:08 2012 +0900 cluster/corosync: order unblock messsage with confchg events Multicast messages after block operations also must be ordered with confchg events. For example, if an unblock event of SD_OP_NEW_VDI is not ordered with confchg events, post_cluster_new_vdi() may not be called on newly added nodes. 
Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 3a98325a13d4a8e2a43df983f17a12f33f6a6166 Author: MORITA Kazutaka Date: Sun Sep 2 03:10:35 2012 +0900 tests: check message order between vdi creation and object recovery Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 61c3126767bae689322de1d2853eac208a399e97 Author: Liu Yuan Date: Tue Sep 4 20:35:36 2012 +0800 gateway: fix poll() hang Use timeout poll instead of poll+keepalive Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit cb22fea240265dbf479ce2ed2f2f00459d630d60 Author: Liu Yuan Date: Tue Sep 4 18:57:38 2012 +0800 net: fix embryonic connection handling in connect_to() We should set_snd_timeout before calling connect() to avoid embryonic connection, which may remain in the half-open state for unbounded periods of time (RFC 793). In this way, we don't need to switch to non-blocking sockfd. This problem can be reproduced by 035. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 020f80696fedeb19f4d94a249293e994e021ea32 Author: Liu Yuan Date: Tue Sep 4 18:57:37 2012 +0800 test: consoliate test 035 to test more bugs - also add two helpers: _simulate_mahcine_done and _cleanup_machine_simulation Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 6d6a8e552224f152c278d2523f34605d942e47a0 Author: MORITA Kazutaka Date: Fri Aug 31 22:16:14 2012 +0900 tests: add test of vdi rollback Signed-off-by: MORITA Kazutaka commit d0ef173512253bad23bbc6b43b0ee86ec8831532 Author: MORITA Kazutaka Date: Fri Aug 31 17:21:38 2012 +0900 collie: add vdi rollback support This adds a new command 'vdi rollback' to rollback to the requested snapshot.
Here is an example: $ collie vdi create test 4G $ echo snap1 | collie vdi write test 0 512 $ collie vdi snapshot test -s snap1 $ echo snap2 | collie vdi write test 0 512 $ collie vdi snapshot test -s snap2 $ collie vdi read test 0 512 snap2 $ collie vdi rollback test -s snap1 $ collie vdi read test 0 512 snap1 $ collie vdi rollback test -s snap2 $ collie vdi read test 0 512 snap2 $ collie vdi list Name Id Size Used Shared Creation time VDI id Copies Tag s test 1 4.0 GB 4.0 MB 0.0 MB 2012-09-01 09:29 7c2b25 1 snap1 s test 2 4.0 GB 4.0 MB 0.0 MB 2012-09-01 09:30 7c2b26 1 snap2 test 3 4.0 GB 0.0 MB 4.0 MB 2012-09-01 09:30 7c2b29 1 Signed-off-by: MORITA Kazutaka commit 2984ebd418c0ab06a58bd3d3296a3e327453b05b Author: Liu Yuan Date: Sat Sep 1 23:26:18 2012 +0800 sheep: fix eventfd_read() error handling This is motivated by a crash log: ... Sep 01 21:53:50 [block] suspend(64) going to resume Sep 01 21:53:50 [block] exec_local_req(449) event fd read error Interrupted system call Sep 01 21:53:50 [block] do_process_work(1245) failed: 12, 0 , 1, 8 Sep 01 21:53:50 [main] cluster_op_done(283) LOCK_VDI (0xc7a790) Sep 01 21:53:50 [gway 4] do_process_work(1238) 83, c7c850, 32580 ... that eventfd_read() get EINTR without proper handling and then mess up the next process. - retry read in exec_local_req() (worker thread) - return in enable/disable_tracer() (main thread) Signed-off-by: Liu Yuan commit ce4adbc6074f1b44bf8748ee0366e0046e69e25f Author: Liu Yuan Date: Sat Sep 1 23:26:17 2012 +0800 trace: fix random crash for enable command We should wait for all the worker's signal handler running like old code, or we will get random crash when signal handler is called after patching. Signed-off-by: Liu Yuan commit ca72e0d9f547e49711ae1a71c090130bb3adee64 Author: Liu Yuan Date: Sat Sep 1 23:26:16 2012 +0800 trace: run trace cat as a worker The subtle case is that when tracer is in patching, we should return AGAIN to buffer reader to ask him to try again until patching is down. 
Signed-off-by: Liu Yuan commit f32b4cb1c318101741843f0a83e0964a1a200098 Author: Liu Yuan Date: Fri Aug 31 23:17:06 2012 +0800 trace: don't do patching while short_thread_running When the code patcher wakes up, there is still chances that short thread is running. We simply return to wait for next eventfd notification until we are safe to do patching. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit f17fb8a3a016f6ccee81fe947b79878d6d0bb777 Author: MORITA Kazutaka Date: Fri Aug 31 19:05:55 2012 +0900 tests: ensure that sheep processes exit after calling pkill After send a signal, there is a delay until sheep exits. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit febe0903516ddb2387ea256b905c72660740f467 Author: MORITA Kazutaka Date: Fri Aug 31 16:34:42 2012 +0900 sheep: save config before logging current epoch If sheep exits after writing epoch info and before saving config in cluster_make_fs(), it will send a wrong config to master and fail to join Sheepdog when it restarts. This patch saves config before logging epoch info to avoid the problem. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 1355e17798788d1a9c48e202de795a4dba8dac4b Author: MORITA Kazutaka Date: Fri Aug 31 16:34:41 2012 +0900 sheep: save store driver name outside storage driver Storage drivers shouldn't care about config file. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit d4b8f315c985c5037410cf6e404eab9def9b9b6f Author: MORITA Kazutaka Date: Thu Aug 30 09:36:17 2012 +0900 sheep: split notification messages into two queues Currently, the corosync driver uses two queues, confchg_list and notify_list. However, this causes problems because some messages (e.g. format, shutdown, cluster snapshot, ...) needs to be ordered with confchg events. This patch splits multicast messages into two queues, block_list and nonblock_list. All block messages (e.g. 
vdi creation, etc) are linked into block_list queue, and other notification messages and confchg events are linked into nonblock_list. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 2068ff1b1d3d4f5ee1019886f3c5e1fbd692aea6 Author: Liu Yuan Date: Fri Aug 31 12:22:19 2012 +0800 test: consolidate 012 to git rid of spurious failure Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 7544bc5fb0e43edd61c6c647a128e740ef6dcafc Author: Liu Yuan Date: Fri Aug 31 11:23:05 2012 +0800 trace: fix compile error without --enable-trace Signed-off-by: Liu Yuan commit 99e1670ac7f4ff1c1d307a5b5e738e56472de101 Author: MORITA Kazutaka Date: Thu Aug 30 18:02:58 2012 +0900 tests: use new cache options Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 22fad00ba59388fd690ca164d3fd28c2060298f0 Author: MORITA Kazutaka Date: Thu Aug 30 18:02:57 2012 +0900 collie: refine cache option of vdi write What we should set when writing objects is not "whether we use object cache" but "whether we use a writeback mode". Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 91e6884e6e7b65ce59d24ed4794c8696f5df5e71 Author: MORITA Kazutaka Date: Thu Aug 30 18:02:56 2012 +0900 sheep: use qemu cache option to use writeback/writethrough mode When writeback is enabled, SD_FLAG_CMD_CACHE flag is set to I/O requests. This patch uses it for write mode. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 425a01ca0bb74bed8c3627d57367bfedfc75c6c6 Author: Liu Yuan Date: Thu Aug 30 17:32:44 2012 +0800 trace: fix ret_stack_index type Sometimes it can be negative, in this case it will be too big as unsigned. Also mark two functions as notrace because they are producing too high a volume of trace.
Signed-off-by: Liu Yuan commit 8edaf0df53b6e8671471f8ff913e33856adbafed Author: Liu Yuan Date: Thu Aug 30 17:25:51 2012 +0800 trace: change the opcode name Signed-off-by: Liu Yuan commit c5630ed145809be5c2b3e435ea120fa8b6dae494 Author: Liu Yuan Date: Thu Aug 30 15:48:45 2012 +0800 collie: rework debug trace funtions to use subcommand mechanism The new look is: tailai.ly@taobao:~/sheepdog$ collie/collie debug trace Usage: collie debug trace {start|stop|cat} [-a address] [-p port] [-h] Available subcommands: start start the trace stop stop the trace cat cat the trace Options: -a, --address specify the daemon address (default: localhost) -p, --port specify the daemon port -h, --help display this help and exit The change is that 'stop' command actually read all the buffer to /tmp/tracefile then we can 'cat' this file until it is overridden by next trace. Signed-off-by: Liu Yuan commit b09f371a9e08a67e3bdad11032fe15a95652c8bd Author: Liu Yuan Date: Thu Aug 30 14:56:50 2012 +0800 sheep: rename worker threads for gateway and revoery Because they use short threads, their name is actually combination of name string and idx. The idx might be big, so we try to keep short name for short threads. Signed-off-by: Liu Yuan commit c3c450b225166ac5d9c6b0f4eba7a5ac52e3ecf7 Author: Liu Yuan Date: Thu Aug 30 14:56:50 2012 +0800 trace: rework the trace mechnasim to work with current code There are mainly two changes: 1 remove dumb ring_buffer and use strbuf to store trace items 2 use another mechanism to suspend/resume worker threads for dynamic patching. As before, if users don't configure with --enable-trace, there is actually zero impact on the code base. All the tracer funtions will be complied out. With reworks, add a new field to trace output: thread name (worker_name + worker_idx). For e.x, we can only filter the trace output and get the gateway thread trace: $ collie/collie debug trace start ... 
cluster is operating $ collie/collie debug trace stop $ collie/collie debug trace cat > t $ grep 'gway 27' t | > t1 $ cat t1 gway 27 | | do_process_work() { gway 27 | | gateway_write_obj() { gway 27 | | gateway_forward_request() { gway 27 | | gateway_to_peer_opcode() { gway 27 | 0.793| } gway 27 | | get_sd_op() { gway 27 | 0.598| } gway 27 | | get_req_copy_number() { gway 27 | 0.547| } gway 27 | | sheep_get_sockfd() { gway 27 | | addr_to_str() { gway 27 | 1.904| } gway 27 | | sockfd_cache_search() { gway 27 | 0.675| } gway 27 | | xmalloc() { gway 27 | 1.36 | } gway 27 | 19.773| } gway 27 | | send_req() { gway 27 | 29.548| } gway 27 | | sheep_get_sockfd() { gway 27 | | addr_to_str() { gway 27 | 2.499| } gway 27 | | sockfd_cache_search() { gway 27 | 5.33 | } gway 27 | | xmalloc() { gway 27 | 1.859| } gway 27 | 35.478| } gway 27 | | send_req() { gway 27 | 27.173| } gway 27 | | sheep_do_op_work() { gway 27 | | peer_write_obj() { gway 27 | | do_write_obj() { gway 27 | | strbuf_addf() { gway 27 | 5.809| } gway 27 | | jrnl_begin() { gway 27 | | xzalloc() { gway 27 | | xmalloc() { gway 27 | 1.143| } gway 27 | 3.141| } gway 27 | | xpwrite() { gway 27 | 41206.762| } gway 27 | | xpwrite() { gway 27 | 27106.822| } gway 27 | | xpwrite() { gway 27 | 16433.765| } gway 27 | 84822.558| } gway 27 | | default_write() { gway 27 | | get_obj_path() { gway 27 | 1.873| } gway 27 | | xpwrite() { gway 27 | 37716.486| } gway 27 | 37739.304| } ... Signed-off-by: Liu Yuan commit 5264ff4a4987d62f53d62d336bb2265867702645 Author: Liu Yuan Date: Thu Aug 30 14:56:50 2012 +0800 logger: add a helper to get the thread name This patch prepare for trace patch set. Signed-off-by: Liu Yuan commit df6e27a2102ac52e9a6b3fde1b70be52b8334bdd Author: Liu Yuan Date: Thu Aug 30 14:56:47 2012 +0800 lib: add a helper to strip out strbuf This patch prepares for trace patch set. 
Signed-off-by: Liu Yuan commit 1bfc4676c098eb171c6424d83514a7afb28d2a63 Author: MORITA Kazutaka Date: Thu Aug 30 09:36:16 2012 +0900 tests: add tests to check message order of format operation Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit c4906d14d824ca7f69aa8caabbd3180719b7562f Author: MORITA Kazutaka Date: Thu Aug 30 09:36:15 2012 +0900 tests: add check of the number of running sheep processes Without this patch, check script cannot return from _wait_for_sheep() when one of sheep processs exits unexpectedly. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit d8f98eb71128bfadb3cd0858c7879ae8df2a7c35 Author: MORITA Kazutaka Date: Thu Aug 30 03:42:33 2012 +0900 plain_store: print more information on error path This makes debugging easier. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 8555a67353e87b0292e90a5d59e3d60a26f6eace Author: levin li Date: Wed Aug 29 17:08:12 2012 +0800 collie: don't fetch node list if not use Signed-off-by: levin li Signed-off-by: Liu Yuan commit 8b7ab69d60918d940a021679aeafde9388f31eee Author: levin li Date: Wed Aug 29 14:35:58 2012 +0800 sheep: use short thread for recovery Most time sheep isn't in recovery state, so the two recovery work queue recovery_wqueue/recovery_notify_wqueue stay there useless, since we have short thread, we can save resource by creating the thread when recovery needs Signed-off-by: levin li Signed-off-by: Liu Yuan commit 9c99212ce7d17725a8f2b844df9268c0f19f3007 Author: levin li Date: Wed Aug 29 15:21:34 2012 +0800 sheep: initialize reclaim work queue only when cache is enabled Signed-off-by: levin li Signed-off-by: Liu Yuan commit 35f4b6787d18f31e152309561e15572925564412 Author: Liu Yuan Date: Wed Aug 29 11:41:17 2012 +0800 test: fix 003 for local driver Local driver detect membership change by signal in 1 s internal, so kill &sleep 1 might have no effect for local driver. This might cause random spurious failure of test. 
Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 4cbcd960f2efd371b1f828562ef345db55ccfaeb Author: MORITA Kazutaka Date: Wed Aug 29 02:18:33 2012 +0900 cluster/corosync: retry when CS_ERR_TRY_AGAIN is returned cpg_initialize often fails with a CS_ERR_TRY_AGAIN error. We should retry in such case. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 9b0c237e27045cbd0bb83363e943b7baeea54178 Author: MORITA Kazutaka Date: Wed Aug 29 01:43:08 2012 +0900 sheep: handle SD_FLAG_CMD_WRITE in cluster requests If SD_FLAG_CMD_WRITE is set, req->data means sent data, so we don't need to set response data in sd_notify_handler(). Otherwise, it means receive data buffer, so we don't need to set multicast data in prepare_cluster_msg(). This also fixes some local requests which don't set SD_FLAG_CMD_WRITE. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 8df7ff4c08500dbe92e01d821a661d8bd1615215 Author: levin li Date: Tue Aug 28 16:14:54 2012 +0800 sheep: no need to set SD_FLAG_CMD_WRITE for local request Signed-off-by: levin li Signed-off-by: Liu Yuan commit e6e939401753434a935e39539001e445983f8ebc Author: Liu Yuan Date: Tue Aug 28 11:31:19 2012 +0800 recovery: fix wrong index in oid_in_recovery() rw->done is index of the original next object to be recovered and also the number of objects already recovered. So rw->done - 1 identifies an already recovered object. Another corner case is that, if rw->done == 0, we'll end up with a -1 value, which might cause seg fault in the array. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit c3c5eff9a8d37cb5dc74bc4b6b8962ed6829e9fd Author: Liu Yuan Date: Tue Aug 28 11:17:08 2012 +0800 collie: remove cluster cleanup command We for now automatically clean up useless objects generated from recovery process.
Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit b3442e66f416cfb23c0f7f8b8cc1c3263de5565d Author: Liu Yuan Date: Mon Aug 27 22:19:08 2012 +0800 sheep: fix dead lock for create_and_write request This dead lock can be reproduced by 026. We should always service CREATE_AND_WRITE request instead of queueing it on wait queues while in recovery. The recovery can be finished without any objects in the list (rw->count == 0 in some special case), in which case that no one calls resume_wait_recovery_requests() or other flusher on rw_list or obj_list. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 527603319081d75525287f17e2ba7ddc68f2a5ab Author: Liu Yuan Date: Mon Aug 27 22:19:07 2012 +0800 test: refine 026 to catch more bugs With this change, current master will fail most of time. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 2f087c06492baab0ab63c185026fe44f81581007 Author: levin li Date: Mon Aug 27 19:39:46 2012 +0800 collie: fix error output of 'collie node info' Signed-off-by: levin li Signed-off-by: Liu Yuan commit 45c1e4b5fe4a66cf442a019766fc25d050f78ae5 Author: levin li Date: Mon Aug 27 18:45:07 2012 +0800 sheep: fix a bug that hangs the cluster during recovery During recovery, a VDI creation request may wait for recovery to complete, and VDI creation request is a cluster request which prevents other cluster requests being processed, when recovery comes to notify_recovery_completion_work, it issues another cluster request with SD_OP_COMPLETE_RECOVERY which is blocked by VDI creation, and as result, notify_recovery_completion_work blocks the recovery_wqueue, if a new recovery comes, it's blocked, at the same time, a VDI creation request may wait for this recovery to complete, so it's a dead lock.
Signed-off-by: levin li Signed-off-by: Liu Yuan commit c5ee72177c24474bb10ba3f0e37d567b64438830 Author: Liu Yuan Date: Mon Aug 27 18:29:18 2012 +0800 sheep: cleanup string format for epoch Signed-off-by: Liu Yuan commit e8c4069612b40abb8e949144e2bd3b7a355614a1 Author: Liu Yuan Date: Mon Aug 27 17:32:26 2012 +0800 net: add a send timeout for sockfd This fixes the bug found by 035. This is kind of a subtle case: 'sendmsg' of A would fill its sock buffer to full while the other end B fails in the middle of the 'sendmsg'. The sock buffer of A is full, then the kernel blocks the thread awaiting on the buffer drain, which will never happen because B is down. Keepalive won't be triggered because the kernel detects that sock buffer has data to send. So the final result is that, the blocked thread will never be woken up. Signed-off-by: Liu Yuan commit dc3cfe88d9217a86de1ab1ec0e9155e36a3ff974 Author: Liu Yuan Date: Mon Aug 27 17:32:18 2012 +0800 test: add a test for sockfd keepalive Signed-off-by: Liu Yuan commit a9382123637abd8db52f31f44f0ff944306833ac Author: MORITA Kazutaka Date: Sun Aug 26 18:31:36 2012 +0900 plain_store: store stale objects to .stale/oid.epoch This is necessary to pass test 034. It is wrong to store stale objects without epoch info. It is because if the objects are updated while leaving cluster, sheep cannot detect it and uses the local stale objects as the latest one. It is mandatory to store stale objects with epoch information like old farm code or simple store. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 8a730cdf779f277b51d2fe24c61b25bd41e110f8 Author: MORITA Kazutaka Date: Sun Aug 26 17:58:09 2012 +0900 sheep: add an argument to for_each_object_in_wd This prepares for the next patch. This also makes object_nr in farm/trunk.c a local variable.
Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 947c2e78ee68245ca69b639a45b16f74c79ba2bc Author: MORITA Kazutaka Date: Sun Aug 26 19:24:05 2012 +0900 tests: add test to check sheep joining with old objects This tests whether clients can read the latest data when sheeps join to the cluster with old objects. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit a606ad26c42ff9b072bccb95c9dfe96dc094b9e3 Author: MORITA Kazutaka Date: Sun Aug 26 17:23:32 2012 +0900 plain_store: remove tmp objects in for_each_object_in_wd Currently, sheep could call a callback function against tmp objects. This fixes the problem. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 34646e119098a8ef7406fda73631155e3704ef54 Author: MORITA Kazutaka Date: Sun Aug 26 09:29:09 2012 +0900 sheep: remove stale objects after recovery fully completed This notifies SD_OP_COMPLETE_RECOVERY to all nodes when object recovery finishes. Sheep removes stale objects when it receives SD_OP_COMPLETE_RECOVERY from all nodes. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit d20f3e1162239b9425b2c2a5c7a10e2633457df0 Author: MORITA Kazutaka Date: Sun Aug 26 12:49:02 2012 +0900 tests: add check of stale object cleanup to 032 and 033 Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 449ee5f79cae384fd8d629ac845169af283c9bfc Author: Liu Yuan Date: Sun Aug 26 19:37:09 2012 +0800 recovery: continue to recovery even when we don't get a valid epoch When all the alive nodes don't have the valid epoch, we rollback to continue recovery object from earlier epoch. This bug is found by 028. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit f85c484cd0815b410c566e6958982a1a3dfac5cb Author: Liu Yuan Date: Sun Aug 26 19:07:48 2012 +0800 test: consolidate 028 With the updated version, current master can't pass it. 
Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit b7a8ecabce6095e80f275512bdc5b66407778f19 Author: Liu Yuan Date: Sun Aug 26 10:25:39 2012 +0800 test: remove xxx.full file in 011 014 - also fix a bug in _wait_for_sheep Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit d068065d0444d4b72becafac4034133a16f6ee4d Author: MORITA Kazutaka Date: Sun Aug 26 10:04:52 2012 +0900 store: make objects stale when sheep starts It is unsafe to remove objects in the working directory when starting sheeps because it is not sure that the objects are correctly recovered to other nodes' working directory. We should make them stale objects rather than removing them. Signed-off-by: MORITA Kazutaka commit adf13b6e15f300bdacbd62ac787ab3e6522cd89b Author: MORITA Kazutaka Date: Sun Aug 26 10:04:51 2012 +0900 tests: add test to check recovery after many nodes failed Signed-off-by: MORITA Kazutaka commit 535b1dd8be1ca0fb499bbc4c1c479fcaf8877ab1 Author: MORITA Kazutaka Date: Sun Aug 26 02:21:41 2012 +0900 sheep: don't allow get_obj_copy_number to return more than nr_zones This is necessary to recover objects when we have less zones available than the desired redundancy. Signed-off-by: MORITA Kazutaka commit ce32aa70f131b56af58bc62028e1f1ea8d6e9a63 Author: MORITA Kazutaka Date: Sun Aug 26 02:21:39 2012 +0900 tests: check results of executed commands more strictly Signed-off-by: MORITA Kazutaka commit cc9f40b6b62fffecec35d307728e26ae39653e0f Author: MORITA Kazutaka Date: Sun Aug 26 02:21:40 2012 +0900 tests: add test of recovery when we have less zones availability Signed-off-by: MORITA Kazutaka commit 24c93dcab4c392e1d6351cc75f263bdf526e3e44 Author: Liu Yuan Date: Sat Aug 25 18:42:45 2012 +0800 farm: clean up snap.c Remove useless code for recovery handling. 
Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit b48ea8eb25f8db0d2bfd2eb2b9f2860ac6fe5b7c Author: Liu Yuan Date: Sat Aug 25 17:32:56 2012 +0800 object cache: remove flock for IO request To quote from Kazutaka: "It is okay to read the older data if the vm doesn't receive the ack of the write requests yet." So we don't need locks either for object cache. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit fb811b92fbab712b420440cfe84e6b7d37bda6da Author: MORITA Kazutaka Date: Sat Aug 25 11:58:20 2012 +0900 store: use O_DIRECT only for data objects init_vdi_copy_number() uses O_DIRECT for vdi objects too. This patch introduces get_open_flags to avoid making this kind of mistake any more. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit b2076a2e070d17c1342188a853f8407093d9aad2 Author: MORITA Kazutaka Date: Sat Aug 25 11:58:19 2012 +0900 tests: add test to check vdi list after cluster startup Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit e8942a1bbbd6890c04d979a2292bc94776f7726f Author: Liu Yuan Date: Sat Aug 25 09:54:11 2012 +0800 test: consolidate 011 to support different filesystem type Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 853e52653335e17790f36136fcdd1591cddf239b Author: Liu Yuan Date: Fri Aug 24 20:10:17 2012 +0800 test: add a test for cluster wide snapshot Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit bb2289c924c700870095d70a4f0454ccde9ec52b Author: Liu Yuan Date: Fri Aug 24 20:10:16 2012 +0800 farm: rework trunk logic Since we move recovery handling out of farm backend, we don't need track IO requests on object, this leave most of functions in trunk.c useless for now. 
Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 369668d56c921ab7ca95b711b45ddbc89881f3f7 Author: Liu Yuan Date: Fri Aug 24 20:10:15 2012 +0800 farm: use default_format for formatting - refine the interface to get name string directly Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit df808dbbcbad610b55d8550ed5a5a3215389c165 Author: Liu Yuan Date: Fri Aug 24 20:10:13 2012 +0800 plain: fix init_objlist_and_vdi_bitmap() We should add vdi copy number to copy list. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit c219e3b356fd93a7bd82a5cbe63a1418094a97ac Author: Liu Yuan Date: Fri Aug 24 20:10:12 2012 +0800 farm: use plain implementation for core IO path This greatly reduces the complexity of farm recovery handling. With this patch, Farm will operate entirely the same as plain store for IO and recovery handling. Farm and Plain store will share core IO functions completely. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 44fccbf733d97c46dbe5979f0f0b2e8ff6adf9c8 Author: levin li Date: Fri Aug 24 15:11:38 2012 +0800 collie: make `collie vdi list` not read vdi_inuse bitmap Signed-off-by: levin li Signed-off-by: MORITA Kazutaka commit db275efa62af7cbc0435205bf66c44146ec8d5cb Author: levin li Date: Fri Aug 24 15:11:37 2012 +0800 sheep: don't send vdi_inuse in get_vdis Since we already send vdi copy list in get_vdis(), there's no need to send the vdi_inuse bitmap any more, without it we can send less data as the length of vdi copy list is variable and it contains the vid just as vdi_inuse does. Signed-off-by: levin li Signed-off-by: MORITA Kazutaka commit 936b6addcd6307b5173dc9f6f36cc89bdf669c85 Author: MORITA Kazutaka Date: Fri Aug 24 16:55:35 2012 +0900 sheep: add plain store driver This introduces a storage driver 'plain' based on the current storage interface. The design of the plain store is similar to one of the farm driver.
The main difference is that the current farm uses the sha1 based backend store for stale objects, but simple_store uses a flat directory for them. With this design, plain store can move objects from the working directory to the backend store efficiently with rename(2). Here are pros vs cons of the plain store against the current farm. Pros: - faster recovery - smaller and simpler - would be a good example to introduce other storage drivers Cons: - cluster snapshot is not supported - stale objects are not deduplicated - there is no sha1 verification Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit ace0ea7e95fd41deac1a8417d82e2cc540146d0b Author: MORITA Kazutaka Date: Thu Aug 23 22:35:13 2012 +0900 sheep: fix vdi exsitence check Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 6b7838220412aa98681e26f50470b3281ccc9d05 Author: Liu Yuan Date: Fri Aug 24 10:31:46 2012 +0800 test: allow more free style test description 23-25 doesn't begin with 'Test'. Instead of modifying it, we'd better allow more free style description Signed-off-by: Liu Yuan commit 20ae154e2601aaf18991a0f3c1218ff248907267 Author: levin li Date: Thu Aug 23 19:34:54 2012 +0800 test: add a test case for different VDI redundancy level Signed-off-by: levin li commit 87577d04ce66f62c623c273f2ae5767e3d2b0b95 Author: levin li Date: Thu Aug 23 19:34:53 2012 +0800 sheep: fix cloning VDI fail when cloned VDI has greater nr_copies than the base Signed-off-by: levin li commit 0b01dec8ac1b7cb60c340e66ecf30b3616902341 Author: levin li Date: Thu Aug 23 19:34:52 2012 +0800 sheep: return the copy number of the request object in peer_read_obj collie uses this return value in the output of 'collie vdi object', without this, we can not pass test 028 Signed-off-by: levin li commit 9404b9da8d3081bb7c9fbca5bae601201bcb8a5a Author: Liu Yuan Date: Thu Aug 23 19:27:34 2012 +0800 test: add a test description to output With this patch, the new look is: tailai.ly@taobao:~/sheepdog/tests$ sudo ./check 27 
PLATFORM -- Linux/x86_64 taobao 2.6.35-32-generic 027 Last Used:7s. Test sheep recovery logic breadth-first algorithm Passed all 1 tests Signed-off-by: Liu Yuan commit af1453222c1b0246369d42567497162c171b3368 Author: Yunkai Zhang Date: Thu Aug 23 17:31:43 2012 +0800 collie: add private options to collie's command Now, all collie's command share the same global collie_options, it will lead to option's name conflict among commands if they use the same options but with different description. By moving the private options into individual structure of each command, and make collie_options only contain the common part of them, we can solve this problem. Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit bda398e4c20b2451bb220bba4950fb8e52059404 Author: levin li Date: Thu Aug 23 11:48:42 2012 +0800 collie: show copy number in the output of 'collie vdi list' Signed-off-by: levin li Signed-off-by: Liu Yuan commit 214608fa8531e0f7195d20dd15f7ece6f1aae9c4 Author: levin li Date: Thu Aug 23 11:48:41 2012 +0800 sheep: remove some unused copies related functions Signed-off-by: levin li Signed-off-by: Liu Yuan commit afb0fc499cca0857e6f7586e2cf31253e0cd2989 Author: levin li Date: Thu Aug 23 11:48:40 2012 +0800 sheep: use the specified copies number for IO requests In gateway_read{write,create_and_write}_obj, and read{write}_object, we should use different copies number for different requests and objects, instead of using the global copies number sys->nr_copies or calculated from vnodes Signed-off-by: levin li Signed-off-by: Liu Yuan commit f794b130c1c7015ea7a9cfb9d4306b39abe227b8 Author: levin li Date: Thu Aug 23 11:48:39 2012 +0800 sheep: fetch vdi copy list after sheep joins the cluster The new joined node doesn't have the vdi copy list, or have incomplete vdi copy list, so we need to fetch the copy list data from other nodes Signed-off-by: levin li Signed-off-by: Liu Yuan commit 571e882f6fd3fb87859281b4393d8c7b77e14614 Author: levin li Date: Thu Aug 23 11:48:38 2012 
+0800 sheep: add vdi_copy tree to store copies number for every VDI The normal requests from QEMU include the copies number in its header, QEMU stores the nr_copies the first time it reads the inode data, but many other local requests such as read_copy_from_replica and recover_object_from_replica don't know the copies number for every object, so this tree is necessary to keep the copies number Signed-off-by: levin li Signed-off-by: Liu Yuan commit 9da9885a3d627b2be44def4bcf4d9d3ae89f4477 Author: levin li Date: Thu Aug 23 11:48:37 2012 +0800 sheep: rename ctime to create_time in vdi.c 'ctime' shadows a variable in pthread.h, in order to use pthread in vdi.c, so we need to rename ctime to get rid of the compile warning Signed-off-by: levin li Signed-off-by: Liu Yuan commit 6f211d37519601f5e2c793ad6a7b6d61aa9404d6 Author: levin li Date: Thu Aug 23 11:48:36 2012 +0800 collie: add -c for 'collie vdi create' to specify redundancy level for per-vdi Signed-off-by: levin li Signed-off-by: Liu Yuan commit c8a4bc56083c17bfdd9a3047fcb536d2e0fd0337 Author: levin li Date: Thu Aug 23 11:48:35 2012 +0800 sheep: use struct vdi_iocb to simplify the vdi_create api Signed-off-by: levin li Signed-off-by: Liu Yuan commit 92c8a028f2705e8d7c7761204796753e8871f636 Author: levin li Date: Wed Aug 22 16:06:39 2012 +0800 test: wait to kill other nodes until the recovery finishs in 026 We kill two nodes in test 026 and then kill another two nodes, if the recovery doesn't finish, four nodes leave may cause object lost, as our default redundancy level is 3. 
Signed-off-by: levin li Signed-off-by: Liu Yuan commit 31563bffcfab1db8a469602e8c2c747c2493c561 Author: levin li Date: Wed Aug 22 16:01:17 2012 +0800 test: sort the output in test 027 The order of the output of 'find' may be random, so we should sort the output first before we compare the output with 027.out Signed-off-by: levin li Signed-off-by: Liu Yuan commit 5e803bca4d34a1505b5b5ae5a7565f12b1b205ec Author: Liu Yuan Date: Wed Aug 22 15:09:59 2012 +0800 test: consolidate 009 and 028 - use helper for wait recovery at 009 - check all the objects in 028 Signed-off-by: Liu Yuan commit 37142f90c88e0e07a5818fc16a3e809868343abb Author: levin li Date: Wed Aug 22 13:08:37 2012 +0800 tests: kill sheep with signal KILL In some cases, we use pkill -f to kill a sheep node, and the log process isn't killed immediately and becomes a defunct process which causes problems for the next start of the same node. Signed-off-by: levin li Signed-off-by: Liu Yuan commit 0952b53cfaca0562764414bf8d04bc41272908b6 Author: Liu Yuan Date: Wed Aug 22 14:31:57 2012 +0800 test: add another test for sheep recovery logic Signed-off-by: Liu Yuan commit fa20c0af759c5770c6cd03519859a0caf1ac695c Author: Liu Yuan Date: Tue Aug 21 22:39:38 2012 +0800 test: add test for recovery logic Signed-off-by: Liu Yuan commit 3e1d21591d4bd8cb68546e25de06fdb05746a8fa Author: MORITA Kazutaka Date: Tue Aug 21 03:37:48 2012 +0900 sheep: make config file compatible with the previous one Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 441eb5dba697325e7d860ee69e6fc96197549d37 Author: Liu Yuan Date: Tue Aug 21 11:03:52 2012 +0800 test: consolidate 010 to check manual recovery Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit c76d16a8414af98bb46750dae1425ccd85c076ca Author: Liu Yuan Date: Mon Aug 20 13:18:08 2012 +0800 test: fix 026 object loss problem We can kill at most 2 nodes in one go with 3 copies.
Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit c1b2566f310adec7818d6959acbff5c78d66778a Author: Liu Yuan Date: Mon Aug 20 12:54:12 2012 +0800 test: make 025 to run with corosync driver Since we don't use _start_sheep() helper, we need to manually add -y option to it Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit f89657b5f51ba87b016d2f0cca30728b0dfe6eb5 Author: Liu Yuan Date: Mon Aug 20 11:57:54 2012 +0800 test: add two helpers to start/kill sheep Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 3af00fafc7657dcf1b46b079782f3da057f3c3d7 Author: Liu Yuan Date: Mon Aug 20 11:57:55 2012 +0800 test: add corosync driver option zookeeper driver is not added yet because I don't know to add option that can take parameters Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit d09a0c01e67eaa3af169d21e5af1657d47b466ef Author: Liu Yuan Date: Fri Aug 17 13:56:44 2012 +0800 test: test vdi create operation during node changes Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit bdfd0b1d59435a1c6c9f588b6da970bdf41d9433 Author: Christoph Hellwig Date: Fri Aug 17 08:18:38 2012 -0400 tests: check for a recovery segfault in older sheepdog versions Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 8cfff805b98319b14dd98f803fd1a605dc43cddb Author: Christoph Hellwig Date: Fri Aug 17 08:18:37 2012 -0400 test: check that data can be read from a node right after joining Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit c9e7b43fa63d47cb22617377bb8000a95c8cb602 Author: Christoph Hellwig Date: Fri Aug 17 08:18:36 2012 -0400 tests: check cluster info after a join storm Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 204eeb82d3d72a503dea81f357f21d1a7f4fe402 Author: Christoph Hellwig Date: Fri Aug 17 08:18:35 2012 -0400 tests: check that creating a vdi without data sheep fails Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 
7fa87bd72bd94253af4a69abed03b90c2bdcdc45 Author: Christoph Hellwig Date: Fri Aug 17 08:18:34 2012 -0400 tests: check that formatting a cluster of gateways works Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 3b9be81352892fc8cb3568a59c4a047c4f6fd6cc Author: Christoph Hellwig Date: Fri Aug 17 08:18:33 2012 -0400 tests: improve _wait_for_sheep Error out if too many sheep are in the cluster, reduce the wait time, and improve the grammar in the comment explaining it. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 884d9806ef91adceea24d7a098036708cbf99948 Author: Liu Yuan Date: Tue Aug 14 16:18:56 2012 +0800 test: add tests for object cache Signed-off-by: Liu Yuan commit fb042b9d251d7bde40d1429f65a86c61a1dbd9b1 Author: Liu Yuan Date: Tue Aug 14 16:18:56 2012 +0800 collie: add flush command for vdi Signed-off-by: Liu Yuan commit 0d36d1d5df35a0bb35ec84819d95be31074fcfda Author: Liu Yuan Date: Tue Aug 14 16:18:53 2012 +0800 collie: add cache support for 'vdi write' This is for test scripts.
Signed-off-by: Liu Yuan commit 98d52fb6bd9710f737e60514288af374afd83ff3 Author: Liu Yuan Date: Tue Aug 14 12:57:31 2012 +0800 test: add a HOWTO doc Signed-off-by: Liu Yuan commit 5e0a9742d0b9cfff104e11c91a9b39a8937f41c2 Author: Liu Yuan Date: Tue Aug 14 12:57:28 2012 +0800 update .gitignore Signed-off-by: Liu Yuan commit 20b94de75c5019829a54e6c4f8d2e5d2b63371e0 Author: Liu Yuan Date: Mon Aug 13 19:17:52 2012 +0800 test: fix 006 We should wait all the sheep to join completion before issuing any collie command - add a helper _wait_for_sheep(nr_sheep) Signed-off-by: Liu Yuan commit 3a99b88571230faca346157c9820e6e460c4371e Author: Liu Yuan Date: Mon Aug 13 19:15:58 2012 +0800 test: fix 015 add a new helper to wait for collie command completion Signed-off-by: Liu Yuan commit 6ba82f14e35dbc4b0249a6e82f693dbd9c59e207 Author: Liu Yuan Date: Mon Aug 13 19:12:21 2012 +0800 tests: fix 016 We should specify the desired disk space to pass the test Signed-off-by: Liu Yuan commit 0ef946c7836ecccc0e7e5ac5d14b381f9d4a6e3f Author: Liu Yuan Date: Mon Aug 13 19:12:11 2012 +0800 sheep: use rintf() to get the vnode calc right Signed-off-by: Liu Yuan commit 8d6bbf4084bc8f21c9b0a78def27bcf71fdb3364 Author: Liu Yuan Date: Mon Aug 13 15:55:15 2012 +0800 sheep: fix stat_sheep() - use sys->disk_space instead of a duplicate sys call - use bytes internally for disk space calculation Signed-off-by: Liu Yuan commit aa0a1c8bd1c4239eb183b1ab3a527dec8bc63025 Author: Liu Yuan Date: Mon Aug 13 15:55:12 2012 +0800 test: fix 011 to umount the right mountpoint Signed-off-by: Liu Yuan commit 9a8549df7a6d800a2a1c64f5f1230568722b3664 Author: Christoph Hellwig Date: Fri Aug 10 10:38:17 2012 -0400 initial xfstests-style test suite Run using "./check" in the test directory, individual tests can be run using "./check 003", various other options are also supported. 
Test 002 is disable for now due to the lack of golden output, cluster drivers other than local aren't tested at the moment but will be supported soon. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit fd403d502d389da11580d0a15b55c316e8d3e68f Author: MORITA Kazutaka Date: Tue Aug 7 18:52:58 2012 +0900 add test scripts This also removes old tests which are no longer used. Reviewed-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit fa58c9275f4094b80885fd9d9019b27dc61864a6 Author: MORITA Kazutaka Date: Tue Aug 7 20:11:34 2012 +0900 farm: fix comparison of empty sha1 Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 4795f57e98c0d449cc4cc7d1ebbbe1fa52727afb Author: Christoph Hellwig Date: Mon Aug 6 11:07:24 2012 -0400 logger: split the main loop into a separate function Refactor the logger code so that its main loop is a separate function instead having it inside log_init. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 1c30269ebd9c2747afa33effe9a6c11b7672aaf6 Author: Christoph Hellwig Date: Mon Aug 6 11:07:23 2012 -0400 logger: keep the log fd private to the logger process We only need to access the log file from the logger process, so make sure it only is available there and not exposed to the sheep which will keep (leak) a reference to the initial one and won't be able to use it after is has been rotate. Also kill the unused log to syslog feature to make the change easier. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 510117ef47a59b929a99483586cb9d3da9092f8b Author: Christoph Hellwig Date: Mon Aug 6 11:07:22 2012 -0400 logger: do not expose implementation details in logger.h Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit e60d78dd14cb649ae3aa12102c569642b26f9e3e Author: Christoph Hellwig Date: Mon Aug 6 11:07:21 2012 -0400 logger: do not leak fds when rotating log files dup2 creates a new reference to a fd, so we have to close the old reference. 
Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit a5fb626e9b503d5df837f030211f312196a965f5 Author: levin li Date: Mon Aug 6 10:14:15 2012 +0800 sheep: fix 'collie cluster cleanup' crash in gateway-only node In gateway-only node, store driver may be uninitialized after restart, so cluster_cleanup() in gateway-only node crashed when we try to call sd_store->cleanup() Signed-off-by: levin li Signed-off-by: Liu Yuan commit 1ad0a92b954ba6d740af2d0599b9581d32ff8709 Author: levin li Date: Mon Aug 6 10:14:14 2012 +0800 sheep: fix 'collie vdi object' crashes in a gateway-only node When collie tries to read directly from peer which may be a gateway-only node, whose store driver is uninitialized, sd_store->read() may cause sheep to crash, so in peer_read_obj() we directly return SD_RES_NO_OBJ if the node is a gateway-only node Signed-off-by: levin li Signed-off-by: Liu Yuan commit 68b9fd59a3e82a816a6b282aa82a49cf4f34900f Author: levin li Date: Sun Aug 5 23:21:14 2012 +0800 sheep: remove SD_MAX{DEFAULT}_REDUNDANCY use SD_MAX{DEFAULT}_COPIES instead Signed-off-by: levin li Signed-off-by: Liu Yuan commit a8627f6902c3521d48eda3ddb3982b3195fd5e3d Author: levin li Date: Sun Aug 5 23:55:54 2012 +0800 sheep: fix a crash caused by float point exception in recalculate_vnodes When sheep starts as a gateway-only node firstly in the cluster, there's no non-gateway-only nodes and thus nr_non_gateway_nodes is zero, and it causes a division by zero which causes gateway-only node crash Signed-off-by: levin li Signed-off-by: Liu Yuan commit fc6d0030a85685e8a6e9c3f44ca36be7b58f80e8 Author: Liu Yuan Date: Sun Aug 5 23:15:51 2012 +0800 collie: fix collie node recovery -add a helper to get the light request's response Signed-off-by: Liu Yuan commit aa57a94e5bbac76f363e568f56cdf8a5c3f74f1a Author: Liu Yuan Date: Sun Aug 5 20:08:59 2012 +0800 sheep: remove unused function same_node() Signed-off-by: Liu Yuan commit 0e4a9d83cd0481aaedac59cd053ec6a2b8a199f3 Author: Yunkai Zhang Date: Sun Aug
5 19:24:01 2012 +0800 sheep: correct current_vnode_info's initialization when recover disabled After disabled recovery, a joining node needs to initialize current_vnode_info. Since it doesn't belong to the cluster before next recovery finished, current_vnode_info should be calculated excluding this node. Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit 8754e1642bd5d1f695f995b58fd7ddc0e0d04ab1 Author: Jens Weber Date: Sun Aug 5 13:28:52 2012 +0200 Update collie.8 man page Add new cluster recover options: Usage: collie cluster recover {info|force|enable|disable} [-a address] [-f] [-p port] [-h] Available subcommands: info show the status of recovery to user force force recover cluster immediately enable enable automatic recovery and run once recover if necessary disable disable automatic recovery caused by JOIN events (excluding LEAVE events now) Signed-off-by: Jens Weber Signed-off-by: Liu Yuan commit b1bbfacf207afb3d2161975021ee6157dff34bce Author: Jens Weber Date: Sun Aug 5 13:19:36 2012 +0200 Update sheep.8 man page - add -w cache_size{,writethrough | writeback} - delete -v option - add -s size, specify the free disk space in megabytes Signed-off-by: Jens Weber Signed-off-by: Liu Yuan commit bdbd7bfa8f548e21c8a7a6e19cdee67f94d6a7c4 Author: levin li Date: Sat Aug 4 19:38:54 2012 +0800 sheep: fix a bug of objects loss when some node has zone set to 0 Before the patch 5b27736abffc3fe569b4ec5335806ad345e279b9 it's OK to set zone to 0, because we skip the same node in get_nth_page(), but after that patch, we just check same_zone() in get_nth_page(), and same_zone() return false when zone of some node is 0, so there's problem, it's reasonable for users to set zone to 0, so this patch fixes this problem Signed-off-by: levin li Signed-off-by: Liu Yuan commit 1ae25e638970bace518267f773cb4fefd0f30860 Author: Liu Yuan Date: Sun Aug 5 14:53:11 2012 +0800 ops: add missing type name for SD_OP_GET_OBJ_LIST - correct name for SD_OP_GET_EPOCH Signed-off-by: Liu Yuan 
commit 5b3fc6b157a8985e4aa2ead26fcbbc203f690c5d Author: Yunkai Zhang Date: Sun Aug 5 14:38:22 2012 +0800 sheep: rename ->vnodes which type is vnode_info to ->vinfo The type of request->vnodes is vnode_info, let's rename it to ->vinfo which will be more descriptive. Cleanup other places as possible as I can according this naming rule. Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit bd5ed94f68fd17fe0a2390b47bbb70586680c269 Author: Yunkai Zhang Date: Sat Aug 4 21:00:17 2012 +0800 sheep: refactor oid_to_vnode[s] and cleanup obj_to_sheep[s] Folds obj_to_sheep[s] into their callers oid_to_vnode[s] so that every one should use oid_to_vnode[s] to map oid to vnode[s], which will make code more descriptive. Refactor the interface of oid_to_vnode[s] so that other caller who without vnode_info but only have vnodes can reuse these functions. Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit 4314fcb0c3282d2ea2c97f9fd198e6bf913d084b Author: Yunkai Zhang Date: Sat Aug 4 18:17:19 2012 +0800 sheep: refactor get_nth_node() and get_vnode_pos() oid_to_sheeps() calls get_nth_node() in a for-loop, but get_nth_node() will do duplicated works because it should find [0..(n-2)] vnodes indexes before it try to calculate n'th vnode index. The return value of get_vnode_pos() seems weird for user, it need to plus one before we can use it. This patch try to refactor these two functions: 1) split a new function named get_vnode_next_idx() from get_nth_node(). 2) rename get_vnode_pos() to get_vnode_first_idx(), and make the return value can be used directly. 3) rename get_nth_node() to get_vnode_nth_idx() which will be more descriptive. Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit c0050e3557bcbb15e84b23a061236723dbdd8e9d Author: Liu Yuan Date: Sun Aug 5 14:21:44 2012 +0800 object cache: implement writethrough mode Object cache writethrough mode provide us a read-only cache which is always consistent with backend store.
We can set the object cache mode by 'w' option as following: sheep -w cache_size{,writethrough | writeback} For e.g, we can set 1G size writethrough cache: $ sheep -w 1000,writethrough $ sheep -w 1000 writethrough mode is default object cache to set as writeback cache: $ sheep -w 1000,writeback Signed-off-by: Liu Yuan commit b29e9c1e6f5556280c6de2ffafc318983e7be712 Author: Liu Yuan Date: Sun Aug 5 14:21:28 2012 +0800 object cache: remove redundant code We only need set CREATE bit in dirty_tree_and_list_insert() once. Signed-off-by: Liu Yuan commit 9266a1481d6fe9d6bcbc84c740b621ad3d6f813c Author: Liu Yuan Date: Sun Aug 5 14:21:28 2012 +0800 object cache: add a new helper to get the entry idx We should always use this helper to get entry idx. Signed-off-by: Liu Yuan commit 6114da8d650916aabb20a7f417a303ce9b86c6a9 Author: Liu Yuan Date: Sun Aug 5 14:21:28 2012 +0800 object cache: refactor read/write path Signed-off-by: Liu Yuan commit eb5e74e400e56816b340380a3cadf9f8c063627f Author: Liu Yuan Date: Sun Aug 5 14:21:21 2012 +0800 object cache: remove file lock in create_cache_object() For now the entry access is protected by get/put_object_entry(), when object is not added to the object list, other threads can't access it. Signed-off-by: Liu Yuan commit 50f9afca07c4c32355131a19b29c719b9b16b859 Author: Liu Yuan Date: Sun Aug 5 14:16:33 2012 +0800 update some info in README Signed-off-by: Liu Yuan commit 85b06624b925b297a864dc468357dde27469ccdf Author: levin li Date: Sun Aug 5 14:10:17 2012 +0800 sheep: remove command line argument --vnodes Since we use free disk space to determine vnodes number, there's no need to take the -v argument, if it's a gateway-only node, we set its vnodes number to be zero. 
Signed-off-by: levin li Signed-off-by: Liu Yuan commit 6f3a89f361796591dd09f5cb46dd67be7c071f92 Author: levin li Date: Sat Aug 4 19:30:03 2012 +0800 sheep: recalculate the vnodes number when epoch changes When epoch changes, new node join or old node leave, we should recalculate the vnode_info for every sd_node, and the disk space is stored in sd_node, transfered to every other node together with join message. Signed-off-by: levin li Signed-off-by: Liu Yuan commit e7a80b671254a3714a9e3747954fa5e77868bcde Author: levin li Date: Sat Aug 4 19:30:02 2012 +0800 sheep: add user-defined free disk space size Sometimes user doesn't want sheep to use all the disk space, maybe some space in the same disk is for other use, such as object cache, or something else, so we can't only get the free space by syscall, we also should provide a way for users to specify the free disk space Signed-off-by: levin li Signed-off-by: Liu Yuan commit 5e39f98abf32faa340afc9b535a56391359b5234 Author: levin li Date: Sat Aug 4 19:30:01 2012 +0800 sheep: get the free disk space and store it in the config file We only specify the free disk space for the first boot, after that we store the size into the config file. 
Signed-off-by: levin li Signed-off-by: Liu Yuan commit 543c44b368543b51ecb50477c7fe7b31b83b661c Author: levin li Date: Fri Aug 3 10:52:14 2012 +0800 sheep: set vdi_inuse bit only when VDI creation success Signed-off-by: levin li Signed-off-by: MORITA Kazutaka commit 88d689642c94bd092c37d99c9161e7843d665000 Author: Liu Yuan Date: Fri Aug 3 10:33:29 2012 +0800 object cache: optimize thread scheduling Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit d652f24f00bef5045233083ff704b252a54beffe Author: Yunkai Zhang Date: Wed Aug 1 15:19:22 2012 +0800 collie: add cluster recover info command Show the status of recovery to user: $ collie cluster recover info Status: disable Joining nodes in inner temporary list: -------------------------------------- Id Host:Port 0 127.0.0.1:7002 1 127.0.0.1:7003 2 127.0.0.1:7004 Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit a9186fd8f0e133ba7065a579da1062650991e89a Author: Yunkai Zhang Date: Wed Aug 1 15:19:21 2012 +0800 sheep: do the real work of disable/enable recovery After disable recovery, all recovery operation in sd_join_handler will be paused. current_vnode_info will not be updated before enable recovery again. a disable_recovery variable was added in join_message so that joining sheep can share cluster's recovery status. Joining nodes will be stored into an inner temporary array which will be used when we enable recovery. At most one recovery operation will be executed when user sending "collie cluster recover enable" command. If there are no joining nodes to recover, none recovery will be done. Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit c6c4d86cf98d0e669cc947af6e17ff87482e603f Author: Yunkai Zhang Date: Wed Aug 1 15:19:20 2012 +0800 collie: add cluster recover {enable|disable} command = Why we need to disable recovery = After disable recovery, we can add multiple nodes into cluster leisurely, all joining nodes will be kept in an inner temporary list, current_vnode_info will not be changed.
Only one recovery operation will be triggered when enable it again, it can help me to reduce cluster's fluctuation. Recovery will be executed immediately if there are nodes leaving from the cluster even if we have disabled recovery, but the joining nodes kept in the inner temporary list will not participate in this recovery, they will be kept until we enable recovery again. PS: the recovery is 'enable' by default. = Usage = 1) Disable cluster recovery: $ collie cluster recover disable *Note*: Only disable the recovery caused by JOIN events Cluster recovery: disable 2) Add multiple nodes into cluster ... 3) Enable cluster recovery: $ collie cluster recover enable Cluster recovery: enable This patch only implements the command-line tools to update inner status, next patch will do the real work. Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit 62cd1b5bffa085602dfefb0f1de700cdbb31cd34 Author: Yunkai Zhang Date: Wed Aug 1 15:19:19 2012 +0800 collie: rename 'collie recover' to 'collie recover force' Rename 'collie recover' to 'collie recover force' so that we can introduce other two recover subcommands: {enable|disable} in the next patch. In order to keep consistent naming, I also renamed some related functions and flags. As we need to express subcommand's subcommand(such as 'recover' is a subcommand of 'collie' and 'force' is a subcommand of 'recover'), I have modified subcommand's data structure and refactored some helper functions a bit.
Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit eb6bffe3cc0d9fbfeca373bd43ad0aab9e5c2655 Author: Liu Yuan Date: Thu Aug 2 17:15:33 2012 +0800 object cache: fix memory leak in object_cache_lookup() Signed-off-by: Liu Yuan commit 16f3e9783aa1084dde1de16a9eca666b299a5d77 Author: Liu Yuan Date: Thu Aug 2 11:17:39 2012 +0800 sheep: always route request to cache layer when cache is enabled Signed-off-by: Liu Yuan commit 4835c2d89b6c811a8939e5cfe43895e1eeecb7d5 Author: Liu Yuan Date: Thu Aug 2 09:35:02 2012 +0800 object cache: don't return when we access object during reclaim We can benefit from RCU list that we can always operate on the list without lock Signed-off-by: Liu Yuan commit 8b9000e1768bdfdb8fee9a118404729c882cc24c Author: Liu Yuan Date: Thu Aug 2 09:34:51 2012 +0800 object cache: don't return when node_in_recovery() Because we have self retry mechanism, we are no longer afraid of node failure. Signed-off-by: Liu Yuan commit 6cb3f7b2e9af2f791286256520157d84017a5bab Author: Liu Yuan Date: Thu Aug 2 09:34:45 2012 +0800 object cache: fix add oid to object list race When there are multiple thread calling object_cache_pull(), they all will return success even though there is only one thread that can actually create the object. Then we should only add oid to object list once. Signed-off-by: Liu Yuan commit 102e8412790cb4d8ec1adaed935221476a5dc4a6 Author: Liu Yuan Date: Thu Aug 2 09:34:37 2012 +0800 object cache: fix object_cache_lookup() one of object_cache_lookup() callers, object_is_cached() can be called in main thread, then we shouldn't try to sleep at main thread, otherwise we'll meet a deadlock between threads doing push and main thread. 
Signed-off-by: Liu Yuan commit bb6bb111f5b25ad5e73d773b96395a1cb16537c8 Author: Liu Yuan Date: Thu Aug 2 09:34:28 2012 +0800 object cache: fix add_to_object_cache() add_to_object_cache() and add_to_dirty_tree_and_list() should be operating atomically, otherwise following race will happen: thread 1 thread 2 create A on object cache add_to_object_cache(A) do_reclaim_object(A) remove_cache_object(A) add_to_dirty_tree_and_list(A) <--- panic! Signed-off-by: Liu Yuan commit 2f94448cd881e67b71fc03eaa1e88ebdf670fa2d Author: Liu Yuan Date: Thu Aug 2 09:34:17 2012 +0800 object cache: simplify reclaim algorithm The old reclaim algorithm tries to push dirty object synchronously, which does lock/unlock mutex dance because push operation is considerably long operation. This dramatically obfuscates the code and logic. We don't actually do this dance because flush operations are periodically issued from guests in a relatively short window (for e.g, less than 1 minute in Linux) if there are dirty pages in kernel's memory. That is, in most cases, we'll see more 'clean' objects than dirty objects. The new algorithm is simple yet efficient (similar to Linux kernel's page cache): - only tries to reclaim 'clean' object, which doesn't have any dirty updates, in a LRU list. - skip the object when it is in R/W operation. - skip the dirty object if it is not in push(writeback) phase. - wait on the dirty object if it is in push phase.
Signed-off-by: Liu Yuan commit cc13f7ad09546a089b60b1fd7cca08d4cf6a406f Author: Jens Weber Date: Tue Jul 31 17:38:40 2012 +0200 fix long option for --enable-cache and update sheep.8 man page sheep.c: fix "option '--enable-cache' doesn't allow an argument" sheep.8: update --enable-cache option Signed-off-by: Jens Weber Signed-off-by: MORITA Kazutaka commit b8fc341d7035cddce9b2d68787433641b4f67df4 Author: levin li Date: Mon Jul 30 17:34:01 2012 +0800 object cache: try to start reclaim when collie resets the max cache size Signed-off-by: levin li Signed-off-by: MORITA Kazutaka commit 0cdd67c3df6394576ffaac7aa9d6d0a568144b30 Author: Liu Yuan Date: Mon Jul 30 15:51:56 2012 +0800 logger: log msg in syslog when log area overun We need this to identify if this happens Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 522ed7677c068b07e3ca1da5ebbd18e8ad1d5d11 Author: levin li Date: Mon Jul 30 12:21:20 2012 +0800 sheep: set max cache size when receive SD_OP_SET_CACHE_SIZE Signed-off-by: levin li Signed-off-by: MORITA Kazutaka commit 1daaa2a7ea2edef8debf42fe4f35192cda27560d Author: levin li Date: Mon Jul 30 15:39:16 2012 +0800 collie: add node cache command to set max cache size Signed-off-by: levin li Signed-off-by: MORITA Kazutaka commit 672a085aba1c43c3b826ac6a67fd36e94430c063 Author: levin li Date: Mon Jul 30 15:33:02 2012 +0800 object cache: remove RECLAIM flag from reclaim path Since we hold lock while calling find_cache_entry() and reclaiming the cache entry, there's no necessary for RECLAIM flag Signed-off-by: levin li Signed-off-by: Liu Yuan commit 278c74b02f6a95436724458389ea117c60549133 Author: Liu Yuan Date: Mon Jul 30 09:37:56 2012 +0800 object cache: don't panic in object_cache_read/wrie() The caller of object_cache_read/write() can handle the 'NO_CACHE' case gracefully. 
Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 807a38d093bf8d1da271a8c7226ffb8c635b6a20 Author: MORITA Kazutaka Date: Mon Jul 30 02:54:18 2012 +0900 collie: list available stores only when the wrong driver is specified Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 7aa346bfbb50a97020817325a3fb4c2ff862e31a Author: MORITA Kazutaka Date: Mon Jul 30 02:54:17 2012 +0900 sheep: check return values of store.init and store.format Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit acbd2f70c51db266436e540e2a4d2ac6cfab5599 Author: MORITA Kazutaka Date: Mon Jul 30 02:54:16 2012 +0900 farm: check whether extended attributes are available with getxattr On some filesystems (e.g. ext3), listxattr doesn't return an error even if extended attributes are not available. This patch uses getxattr instead of listxattr for the xattr check. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit c6a3ec418a2189dc32e28657a08767f8fc185870 Author: Matthew Law Date: Sun Jul 29 15:36:02 2012 +0100 Small grammar fixes in collie.8 man page Signed-off-by: Matthew Law Signed-off-by: MORITA Kazutaka commit b3dda7281d21c790bcc50bf11a70501a6df1ba4e Author: Matthew Law Date: Sun Jul 29 15:21:07 2012 +0100 Small grammar fixes and removal of duplicate loglevel line in sheep.8 man page. Signed-off-by: Matthew Law Signed-off-by: MORITA Kazutaka commit 98241c50d23d99047c37a606a845f5f0bccbbf50 Author: Yunkai Zhang Date: Sun Jul 29 03:42:45 2012 +0800 sheep: let all sheeps with smaller epoch added into delayed_nodes list Since sheeps in delayed_nodes list won't cause recovery when the cluster in WAIT_FOR_JOIN state, it's safe to put sheeps with smaller epoch into delayed_nodes list, regardless of whether it once belonged to the cluster. 
Benifit from this change, we needn't to restart sheep in the following scenario: 1) Start [0,1,2,3] sheeps: epoch of [0,1,2,3] sheeps = 1 2) Kill [0] sheep, and then Shutdown [1,2,3] sheeps epoch of [0] sheep = 1 epoch of [1,2,3] sheeps = 2 3) Start [1,2] sheeps: epoch of [0] sheep = 1 epoch of [1,2,3] sheeps = 2 cluster status = WAIT_FOR_JOIN (waits [3] sheep) 4) Start [0] sheep: [0] sheep will be added into delayed_nodes list, needn't to restart epoch of [0] sheep = 1 epoch of [1,2,3] sheeps = 2 cluster status = WAIT_FOR_JOIN (waits [3] sheep) 5) Start [3] sheep: epoch of [0,1,2,3] sheeps = 3 cluster status = OK Now cluster start working... Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit 64233f7c16d6eafc5b981e5e5405923dfbd99b10 Author: levin li Date: Fri Jul 27 20:01:22 2012 +0800 object cache: add a object_list for cache entry for cache deleting When deleting an entire VDI cache, a object list is easy for traversing comparing to a rb-tree Signed-off-by: levin li Signed-off-by: MORITA Kazutaka commit 99ef884500f2e4b1afc2941a28e78e6c20a9bcb1 Author: levin li Date: Fri Jul 27 20:01:21 2012 +0800 object cache: refactor object_cache_remove() Since all the cache entry are not stored in memory, we can not only remove entry from dirty tree/list, we should also remove it from the object tree/list. Signed-off-by: levin li Signed-off-by: MORITA Kazutaka commit a29aac84e5085dc545bab9cfdaeaa405fe7392f0 Author: levin li Date: Sun Jul 29 10:17:32 2012 +0800 object cache: reclaim cached objects when cache reaches the max size This patch do reclaiming work when the total size of cached objects reaches the max size specified by user, I did it in the following way: 1. check the object tree for the object entry to determine whether the cache entry is exist and whether it's reclaiming, if it's reclaiming we make sheep ingore the cache. 2. 
In object_cache_rw() we search the cache entry, after passed the sanity check, we increment its refcnt to tell the reclaiming worker that this entry is being referenced, we should not reclaim it now. 3. In add_to_object_cache(), when the cached size reaches the max size, we start a reclaiming thread, only one such thread can be running at one time. 4. In reclaim_work(), we reclaim cached objects until the cache size reduced to 80% of the max size. 5. In reclaim_object(), we start to reclaim an object, before this, we check that if the cache is flushing, we don't reclaim it, and if the refcnt of the object is not zero, we also don't reclaim it. If the cached object is dirty, we flush it by push_cache_object(), and then try to remove the object. Signed-off-by: levin li Signed-off-by: MORITA Kazutaka commit d1c507ae970fcf66139cf37df5f8b069802baa04 Author: levin li Date: Fri Jul 27 20:01:19 2012 +0800 object cache: schedule the object cache in a lru list We put all the cached object into a global lru list, when the object cache is referenced(read/write), we move the object to the head of the lru list, then when cache reaches the max size we can reclaim it from the end of the lru list. Signed-off-by: levin li Signed-off-by: MORITA Kazutaka commit f4ffee3095a14cb6f98cd32f1a1a7b30a0e75d40 Author: levin li Date: Fri Jul 27 20:01:18 2012 +0800 object cache: use rwlock to replace mutex lock for per-vdi cache Signed-off-by: levin li Signed-off-by: MORITA Kazutaka commit dbc0ba5a6b8a0a6cacde89b14a2504cf5f85ee30 Author: levin li Date: Fri Jul 27 20:01:17 2012 +0800 object cache: merge active and inactive dirt_tree/list Since we will share the same entry in both object_tree and dirty_tree, it would make thing more complicated to use two dirty tree/list, so merge them together, and use lock when flushing. 
Signed-off-by: levin li Signed-off-by: MORITA Kazutaka commit d35254d65569ccc3d54413fb68ced9478142aaf1 Author: levin li Date: Fri Jul 27 20:01:16 2012 +0800 object cache: add object cache tree for every VDI Add object cache tree for every VDI to keep track of all the objects cached by the VDI, for the reclaiming work. When sheep starts, we should also read the cached objects in disk which is created by the previous running, otherwise, these cache objects may cause a disk leak. Signed-off-by: levin li Signed-off-by: MORITA Kazutaka commit 9f71bdc81c0558d74cf5328137e15610a1eae94c Author: levin li Date: Fri Jul 27 20:01:15 2012 +0800 sheep: use cmd argument -w to specify a max cache size Signed-off-by: levin li Signed-off-by: MORITA Kazutaka commit 7de0a820b50ebd821ac5775bc43daa6008e0a5a7 Author: Liu Yuan Date: Fri Jul 27 17:58:35 2012 +0800 collie: add more error msg for send_light_req() - also move it out of lib/net.c since only collie use it, thus we can use fprintf(stderr, ...) inside to indicate which phase to fail. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit e049feebf58fbd8e7a8ae07bb722fd4625f77a63 Author: Yunkai Zhang Date: Fri Jul 27 14:38:17 2012 +0800 collie: cleanup callbacks of collie command when they send header only requests There are several callbacks of collie command send requests which only contain header. So let's abstract the common part of the them into a new function: send_light_req() which could make code more compactness. 
Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit 2ebc23bfbebb3887c6964d4ea9a52f0e347c931b Author: Yunkai Zhang Date: Wed Jul 25 22:37:56 2012 +0800 zookeeper: the joiner also needs to create member path when master transfer Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit 236b30b1b52bbe891e2bf47c4824f9b5445a2851 Author: levin li Date: Wed Jul 25 20:15:17 2012 +0800 configure: add micro _LGPL_SOURCE to make sheep use liburcu Signed-off-by: levin li Signed-off-by: MORITA Kazutaka commit bd5764c6ded20ce0f2a0cfe9457cae7b69cf1a45 Author: MORITA Kazutaka Date: Wed Jul 25 01:59:21 2012 +0900 cluster/corosync: avoid using CPG_* error codes It seems that CPG_* error codes are not defined in corosync 2.0. This patch uses CS_* instead of them. Signed-off-by: MORITA Kazutaka commit 37d875d71e25f153644822fdcb08f2e65752dd15 Author: Jens Weber Date: Tue Jul 24 14:50:54 2012 +0200 add node kill option collie.8 man page * This patch add the new collie node kill option to collie.8 man page Signed-off-by: Jens Weber Signed-off-by: MORITA Kazutaka commit 439c9899e71e1d8fb11e881d7604eb3ff4587162 Author: Liu Yuan Date: Tue Jul 24 11:19:20 2012 +0800 sockfd cache: grow fds count dynamically This will scale sheep daemon to serve more VMs on one node and do it adoptively and automatically. 
- fd count default to 16 instead of previous 8 Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit c6f98bfec8cef7aca9be46a0c55d36f622682781 Author: MORITA Kazutaka Date: Tue Jul 24 00:13:20 2012 +0900 fix rpm build error Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit accada7b4a9c860d34daddae33261f9114af2ee5 Author: MORITA Kazutaka Date: Tue Jul 24 09:56:30 2012 +0900 configure: make sure that pkg-config is available Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 0a404086c05d8e443b6704ae09d9af7643d61844 Author: Liu Yuan Date: Tue Jul 24 09:50:55 2012 +0800 sockfd cache: fix a comment typo Signed-off-by: Liu Yuan commit cbf9a9503ffda2b0631e84cdd2ea4380adaeecad Author: Liu Yuan Date: Tue Jul 24 09:50:48 2012 +0800 gateway: rename wait_forward_write to wait_forward_request This is not write specific only for now. Signed-off-by: Liu Yuan commit 02fd7d7c2f4f280b093a7c1b8620eeaed9bcf9fc Author: Liu Yuan Date: Mon Jul 23 17:18:43 2012 +0800 sheep: shutdown gracefully when we receive SIGTERM Signed-off-by: Liu Yuan commit b8e529458b564d25365a8a692b6189d869b99947 Author: Liu Yuan Date: Mon Jul 23 17:18:43 2012 +0800 trace: move init_signal to trace_init_signal This is preparation for signal_handler patch Signed-off-by: Liu Yuan commit d31c727d85cdb1e19fe26541f6f08612e72f2f19 Author: Liu Yuan Date: Mon Jul 23 17:18:41 2012 +0800 sheep: add a kill node operation This command is supposed to shut down the specified node gracefully. usage: $ collie node kill node_id Signed-off-by: Liu Yuan commit 95562ce979ac1ed325362a270ed4de8743d13d9d Author: Yunkai Zhang Date: Mon Jul 23 16:54:16 2012 +0800 sheep: fix compare statement in sd_check_join_cb Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit a03cb43559b3e2fc70f959b4c3ce9d147cccc931 Author: Christoph Hellwig Date: Thu Jul 19 12:47:19 2012 -0400 sheep: always access sys->status directly A lot of access already need to be direct, e.g. for using switch statements. 
Change the remaining instances to also directly access it and remove the wrappers for it (and sys_flag_nohalt). sd_leave_handler also got another additional cleanup by using a switch statement and running the nr_zones check (uselessly but cleaner) for the halt state as well so that it matches the join side. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 6c8a02eeb4004f7e04e69d6336e538ee69516950 Author: Christoph Hellwig Date: Wed Jul 18 07:41:10 2012 -0400 sheep: refactor update_cluster_info Use switch statements to make the code cleaner, and make sure we don't perform any actions for a shut down cluster. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit cc0a1dabf131b8907794e4fc1f86a38e88466481 Author: Christoph Hellwig Date: Wed Jul 18 07:41:09 2012 -0400 sheep: fix cluster_flags handling Currenly a sheep not present during format time does not pick up the cluster flags. Fix this and also add a cluster_flags validity check in cluster_sanity_check. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 98885bcb22f8a05ab8eb06835884cb0c4693f2a6 Author: Christoph Hellwig Date: Wed Jul 18 07:41:08 2012 -0400 sheep: use stored nr_copies and flags For a sheep that is restarted we can trust the local nr_copies and cluster flags stored in the config file, so read it from there if is present and only pick up the values from the join message if the sheep does not have a configuration yet. Also change the type of the nr_copes value in struct cluster_info so that get_cluster_copies can be used on it without warnings. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 2a218d87d65c8ab4269dbd7c30cfbed2bdf456fb Author: Christoph Hellwig Date: Wed Jul 18 07:41:07 2012 -0400 sheep: check nr_copies in cluster_sanity_check Make sure a newly joining node has the correct number of copies. 
Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 226f57fb73e528813ba864b83b2c05c5841b52f8 Author: Christoph Hellwig Date: Wed Jul 18 07:41:06 2012 -0400 sheep: refactor sd_check_join_cb By handling the trivial master selfelection separately we can pass down the join_message structure to the various check helpers for the main path and remove the get_cluster_status helper. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit d7e7a34ad706526754615a5c0e22c7126f43646b Author: levin li Date: Mon Jul 23 13:20:25 2012 +0800 sheep: add name for SD_OP_NOTIFY_VDI_DEL Signed-off-by: levin li Signed-off-by: Liu Yuan commit 3e7703969b88d064dc401d8f441d790fc30b9db2 Author: levin li Date: Thu Jul 19 10:19:11 2012 +0800 farm: comment why we can't remove objlist entry while deleting object Signed-off-by: levin li Signed-off-by: Liu Yuan commit a3dc85d85b6ff91fc1e5bd1b82d5b170e83eff0c Author: levin li Date: Thu Jul 19 10:19:10 2012 +0800 object list cache: reclaim object list cache when receiving a deletion event. Before reclaiming the cache belonging to the VDI just deleted, we should test whether the VDI is exist, because after some node delete it and before the notification is sent to all the node, another node may issus a VDI creation event and reused the VDI id again, in which case we should reclaim the cached entry. Signed-off-by: levin li Signed-off-by: Liu Yuan commit 650f04f1f9c2cf5f0cdf5895be73f07b1e854cbb Author: levin li Date: Thu Jul 19 10:19:09 2012 +0800 object list cache: put all the cache entry into a list Compared to rb-tree, putting the entry into a list makes it easy to traverse and reclaim. Signed-off-by: levin li Signed-off-by: Liu Yuan commit c42c318a62e4f6a80ef749fc16ceb3b955df9c99 Author: levin li Date: Thu Jul 19 10:19:08 2012 +0800 sheep: notify VDI deletion to all nodes when deleting a VDI After deleting a VDI, we should notify the VDI deletion event to all the other nodes to make them clear up the object list cache. 
Signed-off-by: levin li Signed-off-by: Liu Yuan commit b72279aa67be99978f13fd6f2ac0bee91a88d9b8 Author: Dietmar Maurer Date: Thu Jul 19 12:23:34 2012 +0200 use have_enough_zones() instead of inline expresion To make it work with new quorum mode. Signed-off-by: Dietmar Maurer Signed-off-by: Liu Yuan commit 9d4c1ddc8f19bcd8a9cb2263f1488cf36c1f4b8d Author: Jens Weber Date: Thu Jul 19 20:09:03 2012 +0200 fix collie.8 man page - replace -H option with -m mode option Signed-off-by: Jens Weber Signed-off-by: Liu Yuan commit c9d6731028290006af869534e9c4d4b66902dbfd Author: Dietmar Maurer Date: Thu Jul 19 11:48:46 2012 +0200 add mode option to man page collie.8 Signed-off-by: Dietmar Maurer Signed-off-by: Liu Yuan commit 23168926dc519982992cc6b8f7b5244f7ed15cb2 Author: Liu Yuan Date: Thu Jul 19 17:47:59 2012 +0800 sheep: don't call farm_init() for gateway-only node Gateway-only node doesn't need backend at all, and we can restart it as gateway node (has backend) again. commit 7757b202c4a256c6352e8b5ac0f8c0714845a741 Author: Dietmar Maurer Date: Thu Jul 19 10:08:25 2012 +0200 add early test for xattr support We want to detect that early and prevent startup. Else the daemon starts and simply crash with panic() later. Signed-off-by: Dietmar Maurer Signed-off-by: Liu Yuan commit c90de6c93c1f26ebe545efb391494bab3879c819 Author: Dietmar Maurer Date: Thu Jul 19 09:50:44 2012 +0200 change nohalt flag into a switch to control which mode to run This removes the --nohalt switch, and adds a mode switch --mode safe|quorum|unsafe safe mode: halt cluster when nr_nodes < nr_copies (default) quorum mode: halt cluster when nr_nodes < nr_copies / 2 + 1 unsafe mode: never halt the cluster. Signed-off-by: Dietmar Maurer Signed-off-by: Liu Yuan commit 4adcee52bb1f4501e181c6b7c39e11a2727ef134 Author: Liu Yuan Date: Thu Jul 19 14:51:38 2012 +0800 sheep: rename sdnet.c to request.c for sheep itself, sd as a file name means nothing. 
We can have a more consistent file naming with request.c Signed-off-by: Liu Yuan commit c2a18418e405c8aab884e3cb8e0888530dea72d4 Author: Liu Yuan Date: Thu Jul 19 14:50:38 2012 +0800 net: refactor client_tx_handler() Signed-off-by: Liu Yuan commit 905ff13556fe7c7de5225ae21ab7925bee6ab415 Author: Liu Yuan Date: Thu Jul 19 14:49:38 2012 +0800 net: refactor client_rx_handler() Signed-off-by: Liu Yuan commit 9d78f698ef2137609d33f947254f59b9a20595d7 Author: Liu Yuan Date: Thu Jul 19 14:48:38 2012 +0800 net: clean up client_tx_handler() - add some comments Signed-off-by: Liu Yuan commit 7c8f409fecbb9e7c6817e227621a91ffa9fb9226 Author: Liu Yuan Date: Thu Jul 19 14:47:38 2012 +0800 net: clean up clear_client_info() usage We should call clear_client_info() as soon as we find the connection is dead. Signed-off-by: Liu Yuan commit 2942d169f22ab4aadb97ab64985090c90d187859 Author: Liu Yuan Date: Thu Jul 19 14:46:38 2012 +0800 net: clean up clear_client() - rename it to clear_client_info() - move dprintf in it instead of call dprintf every time after clear_client() Signed-off-by: Liu Yuan commit cb09c01c4f9425e13ac7884f388d85769b0d38ef Author: Liu Yuan Date: Thu Jul 19 14:45:38 2012 +0800 net: remove request throttling Request throttling is introduced to tackle OOM prolems when requests were very easily accumulated because of very limited worker threads. Now with short threads construct, we will process any request ASAP and requests won't occupy as many memory as before in a short period. So it's time to remove this throttling because they are never reached in reality. Signed-off-by: Liu Yuan commit a394c4de82b5e9fb42463464eca16170108e43bc Author: Yunkai Zhang Date: Thu Jul 19 11:45:07 2012 +0800 zookeeper: fix cluster hang by giving priority to process LEAVE event As cluster request may retry infinitely when some sheeps left, than cluster_op_done could not to be called forever, so it will cause cluster hang problem. 
By giving priority to process LEAVE event when there is unfinished BLOCK event, we can fix this issue, but also comply with the rule which is very important for distributed system I think: All sheeps should process all events in the same order. Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit 343d4164cafc7f3acb081e6b30bb169f80d98d2a Author: Jens Weber Date: Wed Jul 18 15:24:12 2012 +0800 add pidfile option in man page sheep.8 Signed-off-by: Jens Weber Signed-off-by: Liu Yuan commit 5b27736abffc3fe569b4ec5335806ad345e279b9 Author: Christoph Hellwig Date: Tue Jul 17 13:16:11 2012 -0400 remove duplicated check in get_nth_node The same node is per defintion also in the same zone, so remove the duplicate check. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit e840a82adeadb55c659b89c75c2c0c685cdf3470 Author: Dietmar Maurer Date: Wed Jul 18 06:29:16 2012 +0200 fix manpage section Lintian complains about wrong manpage section. This patch fixes the issue. Signed-off-by: Dietmar Maurer Signed-off-by: Liu Yuan commit e6448ec27e5f21e9a7a8b76685ebd36a693271a2 Author: Dietmar Maurer Date: Tue Jul 17 14:17:52 2012 +0800 sheep: add option to create pid file This is useful for init.d scripts. Signed-off-by: Dietmar Maurer Signed-off-by: Liu Yuan commit 0618e6553faeb1d1bf6e34b4574192ab37ccdb72 Author: Liu Yuan Date: Tue Jul 17 09:56:52 2012 +0800 recvoery: don't do recovery when it's a gateway only node Reviewed-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 9f4c4e51552a37c84d9b68097b50e44bdd446805 Author: Christoph Hellwig Date: Mon Jul 16 05:13:27 2012 -0400 sheep: merge object_cache_rw into object_cache_handle_request Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 1ba922364f07541c52af5a529579ccd4fd5b8d9b Author: Christoph Hellwig Date: Mon Jul 16 05:13:26 2012 -0400 sheep: simplify object_cache_write Just call write_cache_object directly instead of going through the effort of faking up a request first. 
Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 6d4f858fa9ece49b893fac502bd649f9ce859f1e Author: Christoph Hellwig Date: Mon Jul 16 05:13:25 2012 -0400 sheep: simplify object_cache_read Just call read_cache_object directly instead of going through the effort of finding a cache object and faking up a request first. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit f37984983ad4fd3a8d70b196cde67cb2cb265ad7 Author: Christoph Hellwig Date: Mon Jul 16 05:13:24 2012 -0400 sheep: factor cache update out of object_cache_rw We'll grow another copy of this code soon, so move it into a separate function in preparation of that. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit eb3c27e54fe5fc6f2f7f60f1e278d691258a1cb5 Author: Christoph Hellwig Date: Fri Jul 13 05:52:14 2012 -0400 sheep: log a message when retrying requests Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 9b9df35bcb83ee0e59a464a9da0f17046f3a3c25 Author: Christoph Hellwig Date: Mon Jul 16 04:44:11 2012 -0400 sheep: improve cluster request debugging Add a few more printfs, print the symbolic command name where available. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit f75b3e8042b00d8227f7e1b050eb72390bb5833c Author: Christoph Hellwig Date: Fri Jul 13 05:52:12 2012 -0400 sheep: print symbolic command names Printing the commands as plain text instead of an opcode makes debugging a lot easier. 
Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit cd7255d1b56fd97208bdeadcaa60a89941008ba3 Author: Jens Weber Date: Sun Jul 15 14:27:11 2012 +0200 script: fix simple2farm unary operator error Fit this: fix simple2farm: line 31: [: ==: unary operator expected Signed-off-by: Jens Weber Signed-off-by: Liu Yuan commit 6c41a618f346c56ba1d0bdcd698158908394ae8e Author: Liu Yuan Date: Thu Jul 12 16:26:34 2012 +0800 gateway: init forward hrd in gateway_forward_request() - add a helper to init hdr - add a map table to mape gateway opcode to peer opcode Signed-off-by: Liu Yuan commit 2f8dfbdfc3196572f619b12fcae55f9c4bf6e937 Author: Liu Yuan Date: Thu Jul 12 16:25:34 2012 +0800 gateway: refactor write and remove functions They share almost the same logic, so let's extract the core out and wrap it a stand-alone function. - add sheep_do_op_work() - opencode do_gateway_write_obj() Signed-off-by: Liu Yuan commit 5d81c7ec01f64ca109171ad9424d435e1db9b3f9 Author: Liu Yuan Date: Thu Jul 12 16:24:34 2012 +0800 sheep: built-in two std headers for util.h Without, we have to include these two headers every time in source file to avoid compile error. Signed-off-by: Liu Yuan commit 6787553213f99dde6de271b52f09d06730c0e8f4 Author: MORITA Kazutaka Date: Thu Jul 12 18:09:37 2012 +0900 sheepdog 0.4.0 Signed-off-by: MORITA Kazutaka commit d61c62760933df122ecd0636e6f4d00d38087398 Author: Liu Yuan Date: Thu Jul 12 16:23:34 2012 +0800 farm: remove unused zlib.h header We don't need it at all for now Signed-off-by: Liu Yuan commit e12b56dfdb01d3ae8059456f179cdb688ee2783e Author: Liu Yuan Date: Thu Jul 12 16:23:11 2012 +0800 configure: disable tracer for 0.4.0 release Current tracer can't work with fully dynamic(short) worker threads due to its per thread ring buffer implementation, as it can in fact only work with static threads. Some of its infrastructure needs rewriting to adopt it to short threads. For now let's disable it. 
Signed-off-by: Liu Yuan commit b9a8631e82786ba76bb4c9fa9721392810b85e5b Author: MORITA Kazutaka Date: Thu Jul 12 09:51:40 2012 +0900 configure: exit if --enable-trace is specified on non x86_64 architectures Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 74bfe9a33f216242eb1ec531ea5ae13841760564 Author: MORITA Kazutaka Date: Thu Jul 12 09:51:39 2012 +0900 configure: exit if --enable-trace is used with debug options Sheepdog tracer assumes that inline functions are inlined, so we cannot use it with the gcc -O0 option. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 5530186842fbb34e753275cd37a56a02f5f5feb7 Author: MORITA Kazutaka Date: Thu Jul 12 09:51:37 2012 +0900 work: fix exit_work_queue compile error Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit ced6f68154da6348d6d1e3426ae492614148e13f Author: MORITA Kazutaka Date: Thu Jul 12 09:51:36 2012 +0900 configure: remove farm option farm is the only storage driver now, so we cannot disable it. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit b77bd0340b9f63ad709f34ac2191ee16e037e0cc Author: MORITA Kazutaka Date: Thu Jul 12 09:51:35 2012 +0900 sheep: fix type warnings on 32 bit machines Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 424fd93360c77fb55de291500d317e9d5bed2cb4 Author: levin li Date: Tue Jul 10 17:02:02 2012 +0800 farm: add flock in read_working_object() We should lock the object when reading it in read_working_object just as we did in farm_read Signed-off-by: levin li Signed-off-by: Liu Yuan commit 00309be270a12b444257898ca8984e2d370f4ac2 Author: MORITA Kazutaka Date: Wed Jul 11 02:37:10 2012 +0900 cluster/corosync: fix double call to sd_block_handler After corosync_unblock() is called and before the unblock message is delivered, sheep can perform the same block event which was previously performed. This is because cluster_op_running is false during that time. This fixes the race condition.
Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit a1c07b23bc19646ec7e0597d9b97940eb83be594 Author: Christoph Hellwig Date: Tue Jul 10 12:42:25 2012 -0400 local: return false for all blocking events in local_process_event This fixes a regression in "cluster/local: process multiple events in local_handler": after that patch we may busy loop under the shared memory area lock when waiting for a join request to be answered, or when another blocking event arrives while one is already being processed. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 21ee9cc0425d861f463e0b88e41159cfcb21cbad Author: MORITA Kazutaka Date: Tue Jul 10 17:17:13 2012 +0900 use GPL sha1 code instead of openssl This uses GPL sha1 code (taken from open-iscsi) instead of OpenSSL to avoid the OpenSSL and GPL license incompatibility issue. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 399bfe09fe2cc652ce43270023ea0288fceebf48 Author: Yunkai Zhang Date: Mon Jul 9 20:50:18 2012 +0800 sheep: fix clear_client crash We should use list_for_each_entry_safe instead of list_for_each_entry to loop when we need to delete a node inside the loop.
Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit 7ac197bbf3be9015d983ea5570dddb965c3c67bb Author: Yunkai Zhang Date: Mon Jul 9 10:21:52 2012 +0800 zookeeper: increase nr_zk_nodes only when added into btree successfully Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit fbc8f4198bfc3c120873b32214e144b24009418f Author: YAMAMOTO Takashi Date: Mon Jul 9 15:29:20 2012 +0900 doc: comment what simple store is Signed-off-by: YAMAMOTO Takashi Signed-off-by: Liu Yuan commit f2d5bd4347049f2b5af97f2c71b50051ddee0fd7 Author: YAMAMOTO Takashi Date: Mon Jul 9 15:29:19 2012 +0900 doc: fix a typo Signed-off-by: YAMAMOTO Takashi Signed-off-by: Liu Yuan commit 471a6d7521b92966947bf832a9ced7b15413e3c6 Author: MORITA Kazutaka Date: Sun Jul 8 13:10:00 2012 +0900 update bash completion file for collie Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 66b7b577656d94ab9a2bf386c97ad00f6949357c Author: MORITA Kazutaka Date: Sun Jul 8 15:36:29 2012 +0900 farm: fix support for vdi attribute objects Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit f29a6cc2358865483bbc179bfff305a5d7fa9886 Author: MORITA Kazutaka Date: Sun Jul 8 15:34:51 2012 +0900 add a helper function to calculate the max size of objects This prepares for the next patch. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 16c0459c4008af7b14d9eaf5da2dc6ef83311172 Author: MORITA Kazutaka Date: Sun Jul 8 13:39:15 2012 +0900 collie: avoid setting proto_ver outside sd_init_req This fixes a version mismatch problem of collie vdi getattr and setattr commands. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit ed0f5b01f2a5772cba491eb220172344fc646d58 Author: MORITA Kazutaka Date: Sun Jul 8 12:16:37 2012 +0900 sheep: increment req->refcnt correctly in object deletion We need to increment the reference counter even if the deletion work doesn't start immediately. 
Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 4c4b0160e95ccf90f6324ca677121536c8da33c2 Author: MORITA Kazutaka Date: Sun Jul 8 10:42:21 2012 +0900 cluster/local: process multiple events in local_handler Currently, the local driver doesn't retry performing the blocked events after they are unblocked. This patch fixes the problem. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit e36e5da2ceed30f64b89bdc7f13d33a5d16e1cc2 Author: YAMAMOTO Takashi Date: Fri Jul 6 17:19:36 2012 +0900 man: fix some typos This patch is based on the github pull request from Takashi. https://github.com/collie/sheepdog/pull/31 Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit cc458b9a0cab05bef50a7dae6dcc3c8e1cabe6ee Author: Yunkai Zhang Date: Fri Jul 6 21:30:17 2012 +0800 sheep: free all requests when connection is dead It's important to free all requests when connection is dead(especially when EPOLLERR occur, some requests might be not cleared correctly), or it will lead to memory leak and sys->nr_outstanding_reqs would be always larger than 0 as a result sheep could not be shutdown. Conneciont's request is created by alloc_request() and will be kept in four places: 1) ci->rx_req -- after alloc_request() 2) inflight -- after reset ci->rx_req and queue_request() 3) ci->dones_reqs -- after put_request() added to ci->dones_reqs list 4) ci->tx_req -- after init_tx_hdr() and assigned to ci->tx_req When request is inflight, ci->refcnt will always be larger than 0, but it will be added to ci->dones_reqs after work_fn finished. As long as we can promise that one request only be kept in one of these four places, than we can free it correctly. This patch do this clear work in clear_client(). 
BTW: add and update some log info Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit 7ce70486c94ddea1740894bfe2c09a9e97ff6cd7 Author: Yunkai Zhang Date: Fri Jul 6 21:30:16 2012 +0800 sheep: simplify client_decref() and move it into free_request() and add a helper function 1) In previous code, sheep calls client_incref() in alloc_request(), but free_request() does not call client_desref() in it, as a result it's difficult to keep ci->refcnt with correct value. Now I drop client_incref/client_decref and call ci->refcnt++/ci->refcnt-- in alloc_request/free_request directly. 2) A bug in put_request(): before calling client_decref(), we should do some clear actions like client_handler(). 3) ci->refcnt should only be increased by alloc_request(), let's initialize it with 0 in create_client(). 4) remove error message in unregister_event() when lookup_event() failed, as this function may be called several times in new helper function which named clear_client() before ci->refcnt reach zero. Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit aa1cab42b0415e60fe8e1b7e3c0400bc5a3374c3 Author: Christoph Hellwig Date: Fri Jul 6 07:52:07 2012 -0400 add strict versioning Enforce that all internal commands have the SD_SHEEP_PROTO_VER protocol version, and any external command that has a protocol version has the right (0x01) one. Also bump SD_SHEEP_PROTO_VER to 0x05 for the upcoming 0.4.0 release which has lots of internal protocol changes. 
Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 9b63c7dbb1865126ac52ec02025ccfea736c2e31 Author: Christoph Hellwig Date: Fri Jul 6 07:52:06 2012 -0400 renumber local flags and errors to be above 0x80 Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 3f981089d1608547d586c42dfee8c88228ec97f7 Author: Christoph Hellwig Date: Fri Jul 6 07:52:05 2012 -0400 remove the unused SD_FLAG_CMD_WEAK_CONSISTENCY flag Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 3f27a812757a2f91f161c5c5660f2a407640364b Author: Christoph Hellwig Date: Fri Jul 6 07:52:04 2012 -0400 remove the unused SD_OP_GET_VM_LIST and SD_OP_KILL_NODE opcodes Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit c3628ca032d182ec5d186ad4995dd7e49c14c449 Author: Christoph Hellwig Date: Fri Jul 6 06:03:48 2012 -0400 corosync: fix comparism in event_is_confchg event_is_confchg needs to check for COROSYNC_EVENT_TYPE_NOTIFY instead of COROSYNC_MSG_TYPE_NOTIFY as it operates on the higher level events. Right now we never search for a notify event so it's not a problem in practice. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 982d5ab43e7c2607596e9065e931d92cae3a91e9 Author: Liu Yuan Date: Fri Jul 6 00:46:52 2012 +0800 corosync: fix cluster hang by cluster requests blocking confchg This hang is caused by cluster request (add new vdi): 1) cluster request blocks the cluster and wait its worker to finish. 2) a confchg happens, but is queued after this cluster request. 3) cluster_request_fn() issues write request but always fail because of one node failure and retry for ever. 4) cluster_request_done() is never called, so we can't unblock the event list The fix is use separate list for notify and confchg event. 
This can be reprodced reliably by following script: ================ for i in `seq 0 7`; do sheep/sheep -d /home/tailai.ly/sheepdog/store/$i -z $i -p $((7000+$i));done sleep 1 collie/collie cluster format -c 3 echo create new vdis ( for i in `seq 0 40`;do collie/collie vdi create test$i 4M done ) & echo kill nodes sleep 1 for i in 1 2 3 4 5; do pkill -f "sheep/sheep -d /home/tailai.ly/sheepdog/store/$i -z $i -p 700$i";sleep 1;done; for i in `seq 1 5`; do sheep/sheep -d /home/tailai.ly/sheepdog/store/$i -z $i -p $((7000+$i));done echo wait for object recovery to finish for ((;;)); do if [ "$(pgrep collie)" ]; then sleep 1 else break fi done ================= Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit f8bb4e61f9e771c02ece2b50e0d6868cc4518b59 Author: levin li Date: Thu Jul 5 15:37:26 2012 +0800 sheep: fix a bad return value check in jrnl_recover Signed-off-by: levin li Signed-off-by: MORITA Kazutaka commit 3d0e84afd59e70255bbc3967024df24682334715 Author: levin li Date: Thu Jul 5 15:21:53 2012 +0800 sheep: read journal head first in journal_recover Sheep doesn't read the journal head in journal_recover which always leaves all the fields of journal head uninitialized, so the target_path is something random. Also we use xpread/xpwrite to get rid of short read/write. Signed-off-by: levin li Signed-off-by: MORITA Kazutaka commit 0afb3f6b9d4444a72755c4dc0546574456431725 Author: Liu Yuan Date: Wed Jul 4 17:39:43 2012 +0800 sdnet: use xzalloc for alloc_local_request() If we can't afford NULL req, we should panic out. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 9a94ac9a6fde0d07c5d81a16e61294d476757dec Author: Christoph Hellwig Date: Wed Jul 4 10:14:44 2012 -0400 sheep: fix object deletion The SD_OP_REMOVE_OBJ command needs the same gateway handling as the write commands, namely we need to find all sheep daemon that have the OID and then forward the remove request to each of them and issue a peer remove request. 
Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 217460f2fedea82b58bcd3cb8fb9e560f6e6f6ae Author: Christoph Hellwig Date: Wed Jul 4 10:14:43 2012 -0400 sheep: use a common process_work handler for all requests Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 3ff661a66a4a2cadf9be753f371813cdc929da01 Author: Christoph Hellwig Date: Wed Jul 4 10:14:42 2012 -0400 sheep: move tgt_epoch handling into queue_io_request This is a preparation for a common work.fn handler for all requests. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 446d07e0b3b54465258b3604b7b353f9ed3f7c46 Author: Christoph Hellwig Date: Wed Jul 4 10:14:41 2012 -0400 sheep: use ->process_work for gateway requests Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 86f5426f9debc7f003c8eee3b489f58b6b67575f Author: Christoph Hellwig Date: Wed Jul 4 10:14:40 2012 -0400 sheep: move object cache handling into forward_read/write_obj_req This is a preparation for removing do_gateway_request. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 0bb5faaa57c6822759fe9a8887ad20c388fe9a95 Author: Christoph Hellwig Date: Wed Jul 4 10:14:39 2012 -0400 sheep: use different opcodes for internal read/write I/O Using different opcodes instead of the SD_FLAG_CMD_IO_LOCAL flags allows us to completely separate the internal protocol from the client-facing one. The only complication is that we can't use do_process_work for I/O on local objects now, but need to invoke the methods directly. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 69b8c41601cf5cb33b3072e4e5caa977348206f9 Author: Christoph Hellwig Date: Wed Jul 4 10:14:38 2012 -0400 sheep: remove do_local_io The separate passing of epoch is rather confusing and even obfuscates the only real user of this code. Also simplify the control flow in forward_read_obj_req around the call to do_local_io / do_process_work.
Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 7c62b6e935b1943c57139a00d1b7d322c8a9c521 Author: Liu Yuan Date: Tue Jul 3 19:16:43 2012 +0800 man: update sheep.8 for cache control Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit c07935907e10ca64e4efae883520394d372e35f0 Author: Liu Yuan Date: Tue Jul 3 18:42:27 2012 +0800 work: make short thread DETACHED This is need to release pthread resource. Reviewed-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 5b710225269334ff7b2cc632e39d5e3b81604f17 Author: Liu Yuan Date: Tue Jul 3 12:14:59 2012 +0800 work: fix memory leak of shor thread creation - also use uatomic_add_return() for short thread idx Signed-off-by: Liu Yuan commit 86ac6156b23c9840f6c4645f81f326f48623aa2a Author: MORITA Kazutaka Date: Tue Jul 3 03:32:16 2012 +0900 logger: don't show a thread index for an ordered work queue [ Fix extra para in logger and remove changes for work.c - Liu Yuan ] Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 907974ff6f4024b6967487dc4b5b25ae702e3188 Author: MORITA Kazutaka Date: Tue Jul 3 03:32:15 2012 +0900 work: clean up workqueue Currently, wi->nr_threads is always 0 or 1, so we can simplify the code. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit e3f8578f62612883a83338f5fa4e98e4e9c9acdf Author: MORITA Kazutaka Date: Tue Jul 3 03:32:14 2012 +0900 update sheep manpage Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit e846fca755db1e9d8cc7b3808123166b792ca57f Author: MORITA Kazutaka Date: Tue Jul 3 03:32:13 2012 +0900 sheep: change the option of the gateway mode from 'G' to 'g' 'g' is no longer used, so let's use it for the gateway mode. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 1ea95e11a36c494e4e8049355d5a75e8795e06f5 Author: MORITA Kazutaka Date: Tue Jul 3 03:32:12 2012 +0900 sheepfs: fix a compile error We need to include sheepdog_sheep.h before sheep.h. 
Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 1308866666abc14ed3a7d52513f9c3478805140e Author: Liu Yuan Date: Mon Jul 2 17:55:04 2012 +0800 sheep: disable object cache as default Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 6e559a8733a8b5a4a0bc1cfe05d1bd1a9ea4708e Author: Christoph Hellwig Date: Mon Jul 2 03:13:27 2012 -0400 add a header for the sheepdog-internal protocol Add a header to separate out the sheepdog-internal (sheep & collie) protocol from implementation details in the source files. This is the first step towards adding separate versioning for it. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 9c9364dee4c7fceca20acc98cadd092406c7955d Author: Liu Yuan Date: Fri Jun 29 18:46:30 2012 +0800 sheep: introduce 'short thread' to worker threads This patch introduces *short thread* abstraction that is created on demand and destroyed after serving the work for gateway or io requests, aiming to solve two problems: 1. timeout of IO requests from guests. With on-demand short threads, we guarantee that there is always one thread available to execute the request ASAP. 2. system halt for corner cases that all gateway and io threads are executing local requests that ask for creation of another thread to execute the request and sleep wait for response. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 29bfdbd6a95fdf8d827e177046dbab12ee342611 Author: Liu Yuan Date: Sun Jul 1 13:38:22 2012 +0800 farm: reset trunk state after purge_obj() - also panic out when we can't purge objects. 
Signed-off-by: Liu Yuan commit 53aca84fef49685187ccde5138033ead313946d0 Author: Liu Yuan Date: Sun Jul 1 13:38:14 2012 +0800 recovery: don't schedule object when the targeted object is not recoveried Without this check, it will lead to duplicate oids when the corner case happens Signed-off-by: Liu Yuan commit ecf44b450a4f72b3b65239da33381cb96951a977 Author: Liu Yuan Date: Fri Jun 29 21:22:06 2012 +0800 sheep: fix forward_read_obj_req() We shouldn't reset the header if it is the last try or the response will get a wrong err code in failure. Signed-off-by: Liu Yuan commit eed1fc9d390681160be0fe1c687b493a21a18a37 Author: Liu Yuan Date: Fri Jun 29 20:49:54 2012 +0800 gateway: add missing '\n' Signed-off-by: Liu Yuan commit 1f4db1b320145ba958c255f11e656ec540f6f2f3 Author: Yunkai Zhang Date: Fri Jun 29 15:26:44 2012 +0800 sheep: try to clear conn.blocking_siblings when EPOLLERR/EPOLLHUP occur Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit bcc9f5073d0b556f9c49e00f72a1aa0616407d1b Author: levin li Date: Fri Jun 29 10:27:20 2012 +0800 sheep: remove nr_outstanding_reqs/outstanding_data_size from local gateway request Local gateway request doesn't allocate memory for data, so outstanding_data_size is useless, nr_outstanding_reqs is also useless here, and as alloc_local_req() is called in worker thread, it may cause race, so this patch remove them from local gateway request path. Signed-off-by: levin li Signed-off-by: Liu Yuan commit 1cfe8a9ae5a4a1b9aa086a1dd2f02673ac58cbb7 Author: Liu Yuan Date: Fri Jun 29 13:35:16 2012 +0800 Revert "sheep: take truns to monitor EPOLLIN and EPOLLOUT events." This reverts commit 6575fdf332276939f006c1624359957558174b63, which forces concurrent requests from QEMU to be unnecessarily serialized. 
commit 4f4a1ea6437d9bb74c02f3e09ff37f9fd5e3b00c Author: Liu Yuan Date: Fri Jun 29 13:33:21 2012 +0800 Revert "sheep: fix bug that blocking_conn_list could never be processed" This reverts commit 1061e219b71a609c7df71ed693b1336c61e940ba, which forces concurrent requests from QEMU to be unnecessarily serialized. commit 09982c564b2fb2a539c075a9ec53df082e3dd32c Author: Liu Yuan Date: Thu Jun 28 17:58:45 2012 +0800 sheep: clean up exec_local_req() Since sd_req has a length field, we don't need the data_len arg in exec_local_req() Signed-off-by: Liu Yuan commit 0cce7cdc9dd5e1e9e5add6b1afb528dc1f1c9701 Author: Liu Yuan Date: Thu Jun 28 17:57:29 2012 +0800 sheep: clean up nr_copies usage For now, only forward_read/write_obj_req() need nr_copies and can get it from vnode info. Signed-off-by: Liu Yuan commit 2a60ceb6192e4c218f4070459c92ddde262af902 Author: Liu Yuan Date: Thu Jun 28 17:22:22 2012 +0800 sheepfs: clean up one sheepfs_pr sheepfs_pr internally prints the line number already, so we don't need to print it again Signed-off-by: Liu Yuan commit 22d4227a44862961fb6bd2fc847424404272e843 Author: Liu Yuan Date: Thu Jun 28 17:21:22 2012 +0800 object cache: add a helper to test idx's vdi bit Signed-off-by: Liu Yuan commit 7f3cdfb192fb32bb7eafde8dc986bfabcacc7043 Author: Yunkai Zhang Date: Thu Jun 28 11:54:10 2012 +0800 sheep: reinitialize conn->blocking_siblings after delete it We use list_empty() to check whether conn->blocking_siblings has joined sys->blocking_conn_list, so it needs to be re-initialized after deleting it from blocking_conn_list. Using list_del_init() instead of list_del() achieves this.
Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit 64a03ada733ca2a0bd4b97ea6441bce52cda8aec Author: MORITA Kazutaka Date: Thu Jun 28 12:41:51 2012 +0900 logger: show thread name when logging to stdout Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 754b3ab85cb7a7dd177990dee59c9b00d18fed94 Author: levin li Date: Thu Jun 28 13:58:43 2012 +0800 sheep: rename req_done to put_request Signed-off-by: levin li Signed-off-by: Liu Yuan commit cbc965f3241902ccc624e5becdc88d11361da93d Author: levin li Date: Thu Jun 28 10:57:13 2012 +0800 sheep: make 'collie vdi delete' wait for response synchronously VDI deletion work should response to client until all the objects have been deleted just as what file system does, this patch makes it call req_done to send back a response until deletion_one_done() has been called. Signed-off-by: levin li Signed-off-by: Liu Yuan commit bc3ae73baae1dc0ddcec2c98a2e9df6cdcaa3a33 Author: levin li Date: Wed Jun 27 22:37:05 2012 +0800 sheep: remove vnode/epoch/copies from vdi path Since remove/read/remove_object don't use nr_copies and epoch any more, so vnode_info/epoch/copies can be removed. Signed-off-by: levin li Signed-off-by: Liu Yuan commit 0597c33edc6b92f2bdc39d3455925095e3a48970 Author: Jens Weber Date: Thu Jun 28 12:14:27 2012 +0800 man: add manpage for sheepfs Signed-off-by: Jens Weber Signed-off-by: Liu Yuan commit c70ac30a9ef797207b776cbe751387f290f4357b Author: Jens Weber Date: Thu Jun 28 12:11:01 2012 +0800 man: first version of collie.8 manpage. [ Modify some content in place - Liu Yuan ] Signed-off-by: Jens Weber Signed-off-by: Liu Yuan commit 08876d15f38aba52e032ee8f3c363c6e36d84cc2 Author: Jens Weber Date: Thu Jun 28 12:08:13 2012 +0800 man: add new option to sheep.8 manpage. 
[Fix some typos in place and remove the async option - Liu Yuan ] Signed-off-by: Jens Weber Signed-off-by: Liu Yuan commit 2b4ad1e0de8289ce7c28256a18e3d8f25614a701 Author: Liu Yuan Date: Thu Jun 28 00:28:35 2012 +0800 sheep: make forward_{read,write}_obj_req() static We no longer call them outside gateway.c, so we'd better make them static and prohibit them from being called outside because they don't have retry support. Developers are advised to call exec_local_req() instead. Signed-off-by: Liu Yuan commit c47ad64a663d792e33730c6cac414aa7842ed29d Author: Liu Yuan Date: Thu Jun 28 00:28:34 2012 +0800 object cache: remove async flush For now we have a retry-based pull/push method, which provides us a much more robust mechanism; async is no longer required to run guests seriously because it violates the block storage semantics that guests expect. Signed-off-by: Liu Yuan commit 33f98cbf723d96aed2a105ab757d6b943cd9251d Author: Liu Yuan Date: Wed Jun 27 21:25:34 2012 +0800 object cache: use exec_local_req() for pull method Then we can benefit from its retry mechanism.
commit c563fafd6051973ee2fdc03bb6f1a36a29821288 Author: Liu Yuan Date: Wed Jun 27 20:50:18 2012 +0800 sheep: fix/modify help message of sheep,collie,sheepfs Signed-off-by: Liu Yuan commit fc2b8cc18e61554a9791d125939f72489d598fdc Author: Liu Yuan Date: Wed Jun 27 17:31:32 2012 +0800 sockfd cache: group idx and fd as as a struct - We get new APIs for manipulation of sockfd: struct sockfd *sheep_get_sockfd(struct node_id *); void sheep_put_sockfd(struct node_id *, struct sockfd *); void sheep_del_sockfd(struct node_id *, struct sockfd *); - change write_info accordingly - add a sub-structure - add a new pfd_info for poll system call Signed-off-by: Liu Yuan commit 349dab673579e21b009b5677ebc5b80377fca249 Author: Liu Yuan Date: Wed Jun 27 17:19:24 2012 +0800 sheep: use node_id_cmp to compare node - remove vnode_node_cmp(), node_cmp() Signed-off-by: Liu Yuan commit ca95c960bbd30ba3cda2a502e32376a7802ecb83 Author: Liu Yuan Date: Wed Jun 27 17:19:24 2012 +0800 sheep: embed struct node_id into struct {sd_node, sd_vnode} Signed-off-by: Liu Yuan commit fcb6c6e0d7aa2abee16cdf9e1e6a69a8b6fbdece Author: Liu Yuan Date: Wed Jun 27 17:19:24 2012 +0800 sheep: try best forward_read_obj_req() We should continue to read if one read fails, since we have strong consistency Signed-off-by: Liu Yuan commit d12fcdba4b9ed58c279c69f63c6e8104d79f3cff Author: Liu Yuan Date: Wed Jun 27 17:19:24 2012 +0800 sheep: output hex for error code Also we ask do_read() to dprintf the 'ret' value instead of blindly printing %m, which looks weird when the remote node crashes (ret = 0) commit d47ae0f0043100fad9d171f27fd01c845f801da2 Author: Liu Yuan Date: Wed Jun 27 17:19:24 2012 +0800 sheep: refactor forward_write_obj_req() commit b93ca9da0baf645247b04161d303359cc93ffbdb Author: Liu Yuan Date: Wed Jun 27 17:19:22 2012 +0800 sheep, sockfd cache: cache more than one FD for each node This is inspired by the observation that each Guest can issue as many as 4 requests in one go.
The complexity added to the code is seen outside of sockfd cache: add one more parameter to the API: FD index. The underlying core needs this to identify which FD belongs one node is actually used. I think this trade-off is a good deal. Signed-off-by: Liu Yuan commit a893b8661891475fd142c4fa3c0f2a8ffafb109d Author: Liu Yuan Date: Wed Jun 27 16:20:53 2012 +0800 sheep: redesign a new cached sockfd pool Old sockfd pool has following defect: 0 statically allocated. 1 use too many fds per sheep, not scalable 2 implemented per thread, can't be shared between threads 3 need resetting at every membership change The new sockfd cache aims to address these problems yet remain as effecient as old one: 0 dynamically allocated/deallocated at node granularity. 1 cached fds are multiplexed by all threads. 2 each session (for e.g, forward_write_obj_req) can grab one fd at a time 3 if there isn't any FD available from cache, use normal connect_to() and close() internally 4 FD are named by IP:PORT uniquely, hence no need of resetting at membership change 5 the total number of FDs shrinks from (nr_gateway + nr_io) * nr_nodes to nr_nodes 6 just add one more API, totally 3 APIs: sheep_{get,put,del}_fd() Signed-off-by: Liu Yuan commit 0b85c9f88097db6ad80287ac553ed045436581b6 Author: levin li Date: Wed Jun 27 11:07:48 2012 +0800 sheep: cleanup the api of read/write/remove_object() and push_cache_object() Since we use a local gateway request to replace the old forward_*_obj_req method, it's no need to keep vnode_info and epoch in this api. Signed-off-by: levin li Signed-off-by: Liu Yuan commit 2c74378f0822bb3312d92dd4168f14628993bbb8 Author: levin li Date: Wed Jun 27 11:07:47 2012 +0800 sheep: make push_cache_object() queue a local gateway request The same reason as write/read/remove_object(), push_cache_object() also needs a retrying machanism like gateway, so we also queue a local gateway request instead of directly calling forward_write_obj_req(). 
Signed-off-by: levin li Signed-off-by: Liu Yuan commit 84cb024ae2d0fba1c82c2964356d5876d1680b58 Author: levin li Date: Wed Jun 27 11:07:46 2012 +0800 sheep: queue local gateway request instead of directly call forward_*_obj_req In read/write/remove_object(), we directly call forward_*_obj_req() to forward the request to peer nodes, but without any retrying mechanism as the gateway does, so we should queue a local gateway request for this routine to make it take advantage of the retrying mechanism of the gateway. Signed-off-by: levin li Signed-off-by: Liu Yuan commit d392f0a2c1b5dd668cc56902100384a4034b5dac Author: levin li Date: Wed Jun 27 10:05:33 2012 +0800 sheep: remove delete_error from vdi deletion_work Since we've made the gateway retry when removing an object fails during recovery, the most likely case that makes remove_object fail is that we meet an EIO, in which case even if we try to delete again the error still exists, so delete_error is no longer needed. Signed-off-by: levin li Signed-off-by: Liu Yuan commit f46fa4707f2f209bc9e4c8e1d025af0bf9f7dd7b Author: Yunkai Zhang Date: Wed Jun 27 09:54:48 2012 +0800 sheep: initiative to send leave event to cluster when shutdown Sheep will stop itself after receiving the collie shutdown command, but the zookeeper server can't detect this departure until the session times out (30 seconds at this time); as a result we have to wait a few seconds before we can restart sheep. This is very annoying. Calling leave_cluster() before sheep exits solves this problem.
Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit c7fb4c8b36ef9c0c72c6192d9ff504146287464c Author: Christoph Hellwig Date: Tue Jun 26 17:01:57 2012 -0400 remove the unused SD_MAX_VMS define Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 01f8f7dbb2588d749d0403c9296b67f52d27a350 Author: Christoph Hellwig Date: Tue Jun 26 06:24:23 2012 -0400 sheep: remove unused nodes/nr_nodes fields in struct cluster_info Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 79d612686bc9835255c6e706dd677ec8ea6a64dd Author: levin li Date: Tue Jun 26 17:27:01 2012 +0800 sheep: fix a bug of segment fault caused by unintialized list_head client_info.conn.blocking_siblings is initialized with INIT_LIST_HEAD, but we check whether it's empty as a list head in client_handler, with an uninitialized list_head, it may not empty, so list_del gives a segment fault. Signed-off-by: levin li Signed-off-by: Liu Yuan commit 2a8dd9598df706da8d87b46b033a315d192a3100 Author: Christoph Hellwig Date: Tue Jun 26 06:25:36 2012 -0400 sheep: remove SD_OP_SHEEP Remove the unused placeholder and replace it with a comment explaining what the deal with 0x80 or higher opcode is. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 65074a1523964c9375fcf4c3d4b9f97074924028 Author: Liu Yuan Date: Tue Jun 26 20:10:46 2012 +0800 objlist cache: init lock variable Signed-off-by: Liu Yuan commit 1061e219b71a609c7df71ed693b1336c61e940ba Author: Yunkai Zhang Date: Sat Jun 23 21:40:35 2012 +0800 sheep: fix bug that blocking_conn_list could never be processed In client_tx_handler(), when the program goto again, ci->tx_req will be re-initialized, so it's value would be always not NULL, and then sys->blocking_conn_list could never be processed. We sould not call init_tx_hdr(), when goto again. BTW: adds some code to help us debuging. 
Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit 6575fdf332276939f006c1624359957558174b63 Author: Yunkai Zhang Date: Mon Jun 25 13:20:45 2012 +0800 sheep: take truns to monitor EPOLLIN and EPOLLOUT events. When both EPOLLIN and EPOLLOUT events occur, client_hander() will always process EPOLLIN event firstly, but this is not reasonable in some case. For example: Client Server | | | monitor(EPOLLIN) | | write() >- send request pkg -> | | read() | | | [1]: monitor(EPOLLIN and EPOLLOUT) | | | <- send response pkg(1/2) -< write() | | | EPOLLIN occur [2]: read() [1]: After the server read the request pkg from a client, it will monitor both EPOLLIN and EPOLLOUT events. Then the server is going to write response pkg to the client, but only 1/2 data was send to the client for network busy. [2]: Before all of the response pkg was send to the client, EPOLLIN occur, then the server will call read() to this fd. In this case, the respone pkg could not be send to the client as soon as quickly. then the server has not chance to send the response pkg completly, it may lead to dead lock. The simplest way to fix this problem is that, let sheep take truns to monitor EPOLLIN and EPOLLIUT events. Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit 3e021e9292a7493649e704429978d5a3b3efe4df Author: Yunkai Zhang Date: Sat Jun 23 21:40:33 2012 +0800 sheep: fix bug when delete conn.blocking_siblings We found that in some case, sheep will crash when going to delete conn.blocking_siblings, because this list is empty. So we should use list_empty() to check it safely. Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit 0712bc2595a3d94447767eb70e7b303e9072e011 Author: Yunkai Zhang Date: Sat Jun 23 21:40:32 2012 +0800 sheep: priority to process EPOLLERR/EPOLLHUP event When EPOLLERR/EPOLLHUP events occur, other events(eg: EPOLLIN) may be inclued at the same time, but we should priority to process them as the connection was destroyed. 
Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit 2fcb160d3a0df5071e853755c7be91750a4e8927 Author: levin li Date: Sun Jun 24 20:13:28 2012 +0800 sheep: avoid calling delete_inode() in main thread When deleting vdi, delete_one_done() calls delete_inode() which do IO request (read and write), while delete_one_done() is called in main thread, so we should not call delete_inode() in delete_one_done(). Signed-off-by: levin li Signed-off-by: Liu Yuan commit 43cac494895ebd23a3fcd059d013573b75a67136 Author: levin li Date: Sun Jun 24 16:32:23 2012 +0800 recovery: fix a bug of get_vnode_info_epoch() which gives a bad node number When local epoch file does not exist, sheep tries to read from remote node by epoch_log_read_remote(), which directly returns the number of node, not the length of node data. Signed-off-by: levin li Signed-off-by: Liu Yuan commit 47bf21bbcbe1d2ba6c03eba189643c6a63730d91 Author: Liu Yuan Date: Sun Jun 24 19:34:36 2012 +0800 sheep: fix local_get_epoch() We should also add the piggybacked ctime to the length Signed-off-by: Liu Yuan commit 524f01378fa8a03545d41c7d7822707bb08f2e79 Author: Liu Yuan Date: Sun Jun 24 19:34:34 2012 +0800 sheep: use xwrite to avoid short write Also add one liner to emphasis that we piggyback the ctime onto epoch file Signed-off-by: Liu Yuan commit 37aece35ebaa0fdd7e7dae116031c3ae78466c6c Author: Liu Yuan Date: Thu Jun 21 14:35:25 2012 +0800 collie: fix vdi check command - use memcmp directly to cmpare the content of object - compare all the replica Signed-off-by: Liu Yuan commit 2361852f10f70df7d011b5577c01150c0ddc328e Author: Liu Yuan Date: Wed Jun 20 19:58:36 2012 +0800 sheep: remove redundant dprintf in do_local_io() Both do_io_request and do_gateway_request already printf the info as follows: ... Jun 20 19:43:03 [main] queue_request(312) 2 Jun 20 19:43:03 [io 3] do_io_request(105) 2, fd34af00000000 , 1 Jun 20 19:43:03 [io 3] do_local_io(52) 2, fd34af00000000 , 1 So remove it. 
Signed-off-by: Liu Yuan commit d401404972d0fa47800dec4b55a65cc17ef0d99f Author: Liu Yuan Date: Wed Jun 20 19:21:09 2012 +0800 collie: clean up sheep's state variables - prefix them all with SD_ Signed-off-by: Liu Yuan commit b5e026e1b01b72fd1c1d61f0c404e023d6d9ccc7 Author: Liu Yuan Date: Wed Jun 20 19:21:05 2012 +0800 collie: remove unused lines Signed-off-by: Liu Yuan commit 73f608048cee4c6202e0d33ec007c74d65b468f1 Author: Liu Yuan Date: Wed Jun 20 18:24:52 2012 +0800 collie: add a check&repair command Quote Kazutaka's comment about object consistency: "With the following scenarios, object replicas could have the different contents: - a gateway node fails while forwarding write requests - total node failure happens while writing objects In the such cases, it is okay for VMs not to read the latest data from the inconsistent objects because the VMs received EIO from them before. However, it is still needed to fix the objects' inconsistency so that the VMs won't read the different data from the objects next time." 
So when those two case happens, uesrs are expected to run: $ collie vdi check affected_vdi_name Signed-off-by: Liu Yuan commit ff363f3795b52dc92f0e3157f7677fa58e4b41f1 Author: Liu Yuan Date: Wed Jun 20 18:24:46 2012 +0800 sheep: remove fix_object_consistency() Jun 08 14:32:42 queue_request(387) 1 Jun 08 14:32:42 do_io_request(105) 1, dc4435000011be , 2 Jun 08 14:32:42 do_local_io(52) 1, dc4435000011be , 2 Jun 08 14:32:42 client_rx_handler(588) connection from: 10.0.1.62:48551 Jun 08 14:32:42 queue_request(387) 1 Jun 08 14:32:42 do_io_request(105) 1, dc4435000011be , 2 Jun 08 14:32:42 do_local_io(52) 1, dc4435000011be , 2 Jun 08 14:32:42 client_rx_handler(588) connection from: 10.0.1.62:48552 Jun 08 14:32:42 queue_request(387) 1 Jun 08 14:32:42 do_io_request(105) 1, dc4435000011be , 2 Jun 08 14:32:42 do_local_io(52) 1, dc4435000011be , 2 Jun 08 14:32:42 client_rx_handler(588) connection from: 10.0.1.62:48549 Jun 08 14:32:42 queue_request(387) 1 Jun 08 14:32:42 do_io_request(105) 1, dc4435000011be , 2 Jun 08 14:32:42 client_rx_handler(588) connection from: 10.0.1.62:48550 Jun 08 14:32:42 do_local_io(52) 1, dc4435000011be , 2 Jun 08 14:32:42 queue_request(387) 2 Jun 08 14:32:42 do_io_request(105) 2, dc4435000011be , 2 Jun 08 14:32:42 do_local_io(52) 2, dc4435000011be , 2 Jun 08 14:32:42 do_io_request(111) failed: 2, dc4435000011be , 2, 3 Jun 08 14:32:42 io_op_done(119) leaving sheepdog cluster Jun 08 14:32:42 client_rx_handler(588) connection from: 10.0.1.62:48551 fix_object_consistency() might be called in multiple threads and cause trouble. So we'd remove it from the sheep core and add a manual check&repair in collie. Signed-off-by: Liu Yuan commit 4ea1f67131b4848b5200ccf13be3d5e638d3f39c Author: Liu Yuan Date: Wed Jun 20 10:59:44 2012 +0800 sheep: don't call epoch_log_read() if epoch is 0 For a fresh node joining, the latest epoch of itself is 0, so we simply return in this case or we'll meet an error message from epoch_log_read(). 
Signed-off-by: Liu Yuan commit 8d19f6cde18a7bfed98073d21d820c714f88bcdf Author: Liu Yuan Date: Wed Jun 20 10:59:44 2012 +0800 sheep: change int epoch -> uint32_t epoch Signed-off-by: Liu Yuan commit 07f7fb51e21444dfce7a512eef63617051ea3cd1 Author: Liu Yuan Date: Wed Jun 20 10:59:44 2012 +0800 sheep: fix nested requests for the same FD Consider following flow: forward_write_obj_req ->get_sheep_fd() <---384 fd ->send_req() <--req 1 ->do_local_io() ->store_create_and_write_obj() ->read_object() ->forward_read_obj_req() ->get_sheep_fd() <---384 fd ->exec_req() ->send_req() <--req 2 ->do_read() <--read response of req 1 This will allow second do_read() read the wrong response. Signed-off-by: Liu Yuan commit 54c6c89a4229a6edc305ab8d1ccd383ef709ec04 Author: Liu Yuan Date: Wed Jun 20 10:59:44 2012 +0800 farm: add a lock for read/write We observed a race for this same object, which resule an EIO: Jun 14 16:16:12 queue_request(387) 1 Jun 14 16:16:12 do_local_io(52) 1, ac1a3e00000000 , 1 Jun 14 16:16:12 listen_handler(805) accepted a new connection: 191 Jun 14 16:16:12 client_rx_handler(588) connection from: 10.232.134.8:42092 Jun 14 16:16:12 queue_request(387) 2 Jun 14 16:16:12 do_local_io(52) 2, ac1a3e00000000 , 1 Jun 14 16:16:12 do_io_request(111) failed: 2, ac1a3e00000000 , 1, 3 Jun 14 16:16:12 io_op_done(119) leaving sheepdog cluster So let's be more denfensive now. Signed-off-by: Liu Yuan commit 1c05a609e6059c1e13223fd926dd9241cca758c1 Author: Liu Yuan Date: Wed Jun 20 10:59:44 2012 +0800 object cache: remove assertion Assertion is too destructive and we actually met this panic out. We can switch to a more conservative code which does no harm. Signed-off-by: Liu Yuan commit 1343a0a6af79701c76e46c2fbc5fdccce34107bc Author: Liu Yuan Date: Wed Jun 20 10:59:44 2012 +0800 object cache: fix object creation failure When the flush request come right after create request, currently we'll push nothing back because we don't mark the object dirty at all. 
We really should do this because we actually want to push back the whole object for creation in backend. Signed-off-by: Liu Yuan commit 5d99cfebe5b94967675a99536d95d5a12c266d37 Author: Liu Yuan Date: Wed Jun 20 10:59:39 2012 +0800 sheep: mute gcc warning about uninitialized var Add a macro to mute below whining: ... work.c:84: warning: ‘idx’ may be used uninitialized in this function Signed-off-by: Liu Yuan commit dcff89dfe3c3647a72f224256b9ccab09c1497f6 Author: MORITA Kazutaka Date: Tue Jun 19 06:27:45 2012 +0900 logger: make thread id more human readable Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit e46513244d94efb35c290246c06fdb3e8535e133 Author: Yunkai Zhang Date: Sun Jun 17 03:00:28 2012 +0800 sheep: print thread id in logfile which could help debuging Main thread's id is 0, other thread's id will be [1..nr_thread]. The output format looks like these: Jun 17 02:30:31|0|queue_request(387) 3 Jun 17 02:30:31|1|do_gateway_request(280) 3, 80a5d05d00000000 , 1 ^ | --- Here is thread id Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit be216b8eab8d3245c76da7b42c12d909cc04ce08 Author: Yunkai Zhang Date: Sun Jun 17 03:00:27 2012 +0800 sheep: fix strlen in log_enqueue Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit 9571eebd57e14fc86d33421f1297c7f73d19e924 Author: Christoph Hellwig Date: Fri Jun 15 11:08:26 2012 -0400 sheep: remove WORK_ORDERED and related code This flag is entirely unused, and just complicates the workqueue code. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 0d419fb46bb8503e715656ede479f882449e7eab Author: Christoph Hellwig Date: Fri Jun 15 11:21:15 2012 -0400 sheep: dont rewrite epoch log for the epoch before a node joins Due to the way the cluster status and inc_epoch works this can get us different historic epochs for different sheep in a cluster. Instead create node list for the previous generation in memory if we need to. 
Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 50927ddc4326e47416dc3a2674d926719bbac41d Author: Christoph Hellwig Date: Fri Jun 15 11:21:14 2012 -0400 sheep: pass vnode info to end_recovery This allows farm to reuse the old epoch node list instead of re-reading it. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit a5d86a98a256a9dd73d43550852fb2754516bbd6 Author: Christoph Hellwig Date: Fri Jun 15 11:21:13 2012 -0400 farm: pass a node list to snap_file_write This will allow passing in a node list from recovery later. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 2d39310898d1b835df56278fa07b4e5c4a576d7a Author: Christoph Hellwig Date: Fri Jun 15 11:21:12 2012 -0400 sheep: handle new nodes during WAIT_FOR_JOIN state There is no problem letting new nodes join a cluster in WAIT_FOR_JOIN state as long as they aren't counted towards the quorum of nodes requires to restart the cluster. For this repurpose the CJ_RES_JOIN_LATER return value that is currently equivalent to CJ_RES_FAIL to indicate we have a node that could join, but wasn't part of the cluster before shutdown. A new nr_delayed_nodes counter and list of delayed nodes is also added to the join message to make sure all nodes know about this kind of delayed node. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit b9bbf1dc9019c2e428b669de5c22e5a9205dbadf Author: Christoph Hellwig Date: Fri Jun 15 11:21:11 2012 -0400 sheep: handle CJ_RES_MASTER_TRANSFER inside cluster_wait_for_join_check Instead of fixing up the return value down in sd_check_join_cb handle this special case directly inside cluster_wait_for_join_check. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 2b8cd3b56072ca97ebc8f3dbb5cb09941c9a0f4d Author: Christoph Hellwig Date: Fri Jun 15 11:21:10 2012 -0400 sheep: refactor get_cluster_status Redo the way get_cluster_status and cluster_sanity_check works. 
The new get_cluster_status is a switch statement over all possible system states, which then branches out into helper functions for states where the checks are non-trivial. This makes the intention of the checks a lot more obvious. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 54652d8d12ccf7374a4d8d33389b8f4d01fb6d64 Author: Christoph Hellwig Date: Fri Jun 15 11:21:09 2012 -0400 sheep: remove SD_RES_JOIN_FAILED We never set this status, so remove it. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit c46b59d55107e87bab8424c2a625ba949726e3f3 Author: Christoph Hellwig Date: Fri Jun 15 11:21:08 2012 -0400 sheep: return CJ_ values directly from cluster_sanity_check Don't return SD_RES_ values from cluster_sanity_check first and then translate them to the CJ_ values later. This also shows that the result field in struct join_message wasn't needed and can be removed entirely. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit fcd557004874ba78e03b54b6dcd8b1246edc8cac Author: Christoph Hellwig Date: Fri Jun 15 11:21:07 2012 -0400 sheep: factor leave_list processing Factor handling of the leave list into a few helpers to make it more obvious. Also rename the leave name component to failed/failed_nodes to make the use case more obvious. Finally, remove the nodes/leave_nodes union in favor of a single array, as an union of two fields with the same type is fairly confusing. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 48d4a977fb6073abb536d95f40a6bd8afc94ef7b Author: Christoph Hellwig Date: Tue Jun 5 08:07:34 2012 -0400 sheep: simplify the cluster driver interface for blocking events Let sd_block_handler handle the fine details of how to handle an incoming blocking event. 
By passing the sender node structure we can easily handle ignoring it on other nodes, and by keeping a local operation-in-progress flag in group.c we can replace the callbacked flag in the on-the-wire events with a way simpler mechanism. The only slightly complicated bit is that zk_notify_blocked in the zookeeper backend can now go negative for a short period of time, so we explicitly have to check for it being positive in two places. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 6e20de84cb91e32ebf854697e37cf53de4262d79 Author: Yunkai Zhang Date: Wed Jun 13 11:38:33 2012 +0800 sheep: set keepalive in server-side socket fd It's important to set the keepalive option in server-side socket fd, this can help the server to clear dead connections quickly when its client crashes for an unexpected reason (e.g. the client reboots or is killed). Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit d6eb076dfd30cec0de88fc3eda304c0ba7acc2c0 Author: Yunkai Zhang Date: Wed Jun 13 11:38:32 2012 +0800 sheep: set keepalive in all client socket fd We have set keepalive in cached socket fd, but uncached fd also needs this option set. So I move set_keepalive() from get_sheep_fd() to connect_to(). Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit 316f3dad66fe23d9fea87981287c4d70dfa55b40 Author: Liu Yuan Date: Thu Jun 14 16:19:14 2012 +0800 object cache: add a helper for oid_to_idx() Signed-off-by: Liu Yuan commit 9366222849031a42a9b06b980a3c54abd4397b99 Author: Liu Yuan Date: Thu Jun 14 16:19:13 2012 +0800 sheep, store: remove oid from farm state Add a new callback named 'remove_object' to properly do store specific operations. This lets farm remove oid from trunk active list to avoid a sha1_file_write failure later on.
Signed-off-by: Liu Yuan commit af7c2a9305cd01001a2edcdcdd54df17cd971abb Author: Liu Yuan Date: Thu Jun 14 16:19:11 2012 +0800 sheep: remove oid from object cache internal list and tree We should remove it from object cache internal state too otherwise flush will return ERROR by reading a non-existent object later on Signed-off-by: Liu Yuan commit 2b9d6f41a850165868e7cc69b0bd973c43994c5e Author: Liu Yuan Date: Wed Jun 13 11:13:01 2012 +0800 object cache: opencode add_to_dirty_tree_and_list() This functoin becomes very convoluted after more and more parameters added in. It actually serves different purpse by different callers, open-code it simplify code a lot. Signed-off-by: Liu Yuan commit 564d3f53bae1824bb994ac13d68a10e89bdb782a Author: Liu Yuan Date: Wed Jun 13 11:05:30 2012 +0800 farm: add two more debug info We still meet the problem and seems that the last patch overlook these two places. Signed-off-by: Liu Yuan commit f645906bc6b6f5837a21189e0c0a799cff79215d Author: Liu Yuan Date: Wed Jun 13 10:43:19 2012 +0800 sheep, recovery: fix screen_object_list() for duplicate oid This bug was a regression by a patch that accidentally remove this qsort() Signed-off-by: Liu Yuan commit 95349321fe81f07e4757b3faeaa525277d4c4c9a Author: Liu Yuan Date: Wed Jun 13 10:43:09 2012 +0800 sheep, recovery: fix wrong size for xrealloc() - also change memmove -> memcpy - add a defensive warning Signed-off-by: Liu Yuan commit 952b1496803681a5201c3e9fb1e6e1aace831b73 Author: Liu Yuan Date: Wed Jun 13 10:43:01 2012 +0800 farm: add defensive debug info We observed a bug emitting the line "write sha1 object fail", which doesn't give why it fails, so add more debug info to assist analysis Signed-off-by: Liu Yuan commit 522d5c2efaaaeb9d914408a4c7eac53301eecde1 Author: Liu Yuan Date: Mon Jun 11 10:08:01 2012 +0800 object cache: fix indentation Signed-off-by: Liu Yuan commit ad3921e8a84669fe93211e78305bb2aa64bea2eb Author: Liu Yuan Date: Mon Jun 11 10:07:57 2012 +0800 object cache: fix wrong 
bmap calculation and flush offset Signed-off-by: Liu Yuan commit 0c88523473bc99d6654057559cae540d0c1c9799 Author: Liu Yuan Date: Fri Jun 8 13:54:19 2012 +0800 sheep: change one-shot timeout to keepalive The timeout of 5s really causes trouble from our observation, we see a lot of timeout failures when cluster is doing IO heavily. Using keepalive means we don't fail-timeout until the other end of node is really down. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit b666cf81d0e707cb518f98be48f023dbaf8c0edd Author: MORITA Kazutaka Date: Fri Jun 8 03:12:39 2012 +0900 object_cache: fix uninitialized value work structure in struct flush_work must be initialized with zero so that work->attr is WORK_SIMPLE. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit b387f0883416252b7cbb44eec5142919f4a7d3b9 Author: Liu Yuan Date: Thu Jun 7 17:29:35 2012 +0800 sheep: fix forward_write_obj_req() This patch addresses one very subtle problem, to quote from Kazutaka: One possibility is that if forward_write_obj_req() fails before receiving data, the next forward_(read|write)_obj_req() could be interleaved. The interleaved requests will return random res->result and sometimes more catastrophic, EIO to upper layer and asks the node to leave the cluster. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 3582eef303f4df341c5b1d574fd4808ecb331017 Author: Christoph Hellwig Date: Thu Jun 7 17:15:41 2012 +0800 sheep: add an option to override the advertised Allow users to override the address advertised to other sheep. This is important for setups where the computers running sheep nodes have multiple network interfaces and we need to use a specific one.
Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 239a6ec7d8916688ea52fafd18f8a2d00504ba05 Author: Christoph Hellwig Date: Wed Jun 6 18:23:23 2012 -0400 sheep: remove read_epoch By opencoding it in the two callers we can not only simplify the code, but also differenciate the nr_nodes = 0 case where we don't want to read the epoch log from a real error reading the epoch log. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit e55458492771c99f4042c6008ab3ad036e9c1640 Author: Christoph Hellwig Date: Wed Jun 6 18:23:22 2012 -0400 From: Christoph Hellwig Suject sheep: cleanup epoch_log_read and friends Pass a struct sd_node array instead of an unformatted buffer to all epoch_log_read variants, and cut down the epoch_log_read/epoch_log_read_nr split down to a single variant, which returns the number of nodes, but is called epoch_log_read. Also make epoch_log_read_remote return the number of nodes, as that's what most callers want. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 4026c3333b9279f8aa16c70199f9056cfed0fce4 Author: Christoph Hellwig Date: Wed Jun 6 08:49:10 2012 -0400 sheep: fix nr_nodes calculation in local_stat_cluster Pass the actual node number to get_max_nr_copies_from, instead of the size of the nodes array. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 0ae0add5f17c74958a378886881425400156097d Author: Christoph Hellwig Date: Wed Jun 6 18:23:21 2012 -0400 sheep: handle epoch_log_read errors in snap_file_write Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit e6cbee247f3a6bcab581ee6e794dfdc257b639b5 Author: Christoph Hellwig Date: Wed Jun 6 08:38:59 2012 -0400 sheep: split getting the local address from cluster driver ->init Getting a suitable address to advertise to other sheep is substancially different functionality from initializing the cluster driver. Split it into a separate optional method that falls back to the getifaddrs loop if not specified. 
Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit f95cad58d85367e1ba7c6d29213a777616481478 Author: Christoph Hellwig Date: Wed Jun 6 06:36:54 2012 -0400 sheep: use getifaddrs in get_local_addr The combination of gethostname and getnameinfo does not seem to work very well to find an IP address for a system that doesn't seem to have a host name, or for one that has IPv6 configured in the kernel without actually using it. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 2e8abd93d0ca56429a30a3637fbd7711301dd049 Author: Christoph Hellwig Date: Wed Jun 6 19:16:02 2012 +0800 sheep: collie cluster recover needs to start recovery Currently we can easily get into a situation where we can't read objects after losing a node in an offline cluster and then doing a manual recovery. To fix this call start_recovery from cluster_manual_recover. Also move get_vnodes_from_epoch into group.c and rename it to fit with the rest of the vnode_info functions now that it is used outside of recovery.c. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit c7df679805c197401ef9af2a83dfdc29030767b5 Author: Christoph Hellwig Date: Tue Jun 5 10:55:51 2012 -0400 sheep: fix error handling in epoch_log_remote Make sure we exit if either get_latest_epoch or epoch_log_read fails. The get_latest_epoch failure case might not be obvious, but given that local_stat_cluster is a force operation we might not have any epoch log on this node by the time it is called. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 9be10c53a13bef3a8d4a2b47025ca554fac8a927 Author: Christoph Hellwig Date: Tue Jun 5 18:32:16 2012 -0400 sheep: move get_addr into lib/net.c Instead of duplicating this routine between the accord and zookeeper drivers move it to lib/net.c as it is generic networking related code. Also rename the function to get_local_addr to be a bit more descriptive.
Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit b308f2f09b9fd023554ac65ffb8f3fb559a7a777 Author: Christoph Hellwig Date: Tue Jun 5 08:32:56 2012 -0400 sheep: do not return a fd from cluster driver init Now that all event handling is inside the cluster drivers we can just return 0 from ->init instead of the file descriptor. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 2e95160daa813486b8047098f231e4cafe4d99c2 Author: Liu Yuan Date: Tue Jun 5 19:37:28 2012 +0800 sheep, recovery: fix memory leak We should also free prio_oids Signed-off-by: Liu Yuan commit 913cc94505b811c7b1b4e28718894db7c08fcd88 Author: Liu Yuan Date: Mon Jun 4 01:09:48 2012 +0800 sheep: fix oid scheduling in recovery It is not thread safe to manipulate rw->oids[] both in main and worker threads. Add a rw->prio_oids[] and let rw->oids[] handling be safe in recover_object_main(), which means no recover_object_work is being executed meanwhile. This also fix a nasty buffer overflow in rw->oids[], which did an insane memmove and clean up code a bit. Signed-off-by: Liu Yuan commit 044d4cc502fc16b58b125199f5595c4554fbbb2e Author: MORITA Kazutaka Date: Mon Jun 4 17:57:21 2012 +0900 sheep: set error result correctly in forward_write_obj Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 72440786ca335fdd1a4976b31064cb28c3d53260 Author: Liu Yuan Date: Mon Jun 4 10:41:32 2012 +0800 farm: drop fcntl lock For a second thought, even QEMU can issue multiple requests to the same object we are guaranteed that those requests don't overlap on each other. Recovery code and IO code might access the same object, but the race is excluded by epoch so we don't need per-object lock at all for now. 
Signed-off-by: Liu Yuan commit e5331ea2e723d32d44b76ef695064ad69e3b148d Author: Liu Yuan Date: Mon Jun 4 10:32:59 2012 +0800 object cache: use flock instead of fcntl for locking To quote manual: "record locks are automatically released when the process terminates or if it closes any file descriptor referring to a file on which locks are held" This means fcntl lock doesn't support locking across FD even in the same process space. Signed-off-by: Liu Yuan commit e8949ee7767f894588f328dff1a00d828365f095 Author: Liu Yuan Date: Sun Jun 3 14:47:17 2012 +0800 sheep: fix finish_object_list() When prepare_object_list() gets an empty list, we should directly call finish_recovery() or oid = 0 will be recovered. Though this does no harm just wasting cpu cycles, we should avoid it completely. Signed-off-by: Liu Yuan commit a532e44d80c3888a3e62655f28770ab77a9ae656 Author: Liu Yuan Date: Sun Jun 3 14:44:09 2012 +0800 sheep: move oid scheduling code into a helper function - rename is_recoverying_oid() -> oid_in_recovery() Signed-off-by: Liu Yuan commit 04a279f4514ab4da453ced0a3ef83cde5783bd32 Author: Liu Yuan Date: Sun Jun 3 14:43:01 2012 +0800 sheep: opencode fill_obj_list() into prepare_object_list() - cleanup the code a bit - better consistent naming - screen_obj_list -> screen_object_list - request_obj_list -> fetch_object_list - add some comments Signed-off-by: Liu Yuan commit 958cd5785401d201244f51424bbdb029047cd57c Author: Liu Yuan Date: Sun Jun 3 14:36:45 2012 +0800 sheep: call list_del() before requeue_request(req) Since we don't call list_del() inside requeue_request() now, this fix dead-locks which always queue request. Signed-off-by: Liu Yuan commit b8aae208641a7a368b6a8bf27bacf54dbab2ca28 Author: Liu Yuan Date: Sun Jun 3 14:36:37 2012 +0800 sheep: refactor fill_obj_list() We have a normal connect_to(), so we don't need manually retry because if it failes, it means the target node is really down. 
- move next_rw check ahead Signed-off-by: Liu Yuan commit 57d778d2fb3899966c10961b25b42ae38f27262d Author: Liu Yuan Date: Sun Jun 3 14:36:10 2012 +0800 sheep: fix finish_object_list() if it jumps out of prepare_object_list() with next_rw set, we should really run_next_rw() instead of starting recovery phase. Signed-off-by: Liu Yuan commit 459a5fa03e05b83c305a1f73d13b79ee759f54a3 Author: Liu Yuan Date: Sun Jun 3 14:36:00 2012 +0800 sheep: remove print_node_list This is useless, since sd_join/leave_handler() already print node list. Signed-off-by: Liu Yuan commit ff87c12174a8f4b304fb09b3e04520a75ff8ad9d Author: Liu Yuan Date: Sun Jun 3 14:35:36 2012 +0800 sheep: fix queue_gateway_request() - we should check if object is cached on the first place - use goto to bypass other unnecessary checks. - remove unnecessay lines in need_consistency_check() Signed-off-by: Liu Yuan commit 55bb1820611809b2ced7edd4ea4f13c86eead139 Author: Liu Yuan Date: Sun Jun 3 14:35:24 2012 +0800 sheep: fix request_in_recovery() This actually revert the commit c2f92ddcd158cec243ea9e53a87e52b97cd61373 and add some comment to prevent us from doing wrong again. Signed-off-by: Liu Yuan commit 7ea2c3f398e8a04372777302c6d072ad12e0e483 Author: Christoph Hellwig Date: Sat Jun 2 11:39:09 2012 -0400 sheep: free the objlist cache entry in objlist_cache_rb_remove Without this we never free objlist entries. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit af38b7b2f0eb826202351781824eecedb00b2f4b Author: Liu Yuan Date: Sat Jun 2 22:36:40 2012 +0800 sheep: rename check_request_in_recovery() As Christoph suggests and modifies it to return bool value. Signed-off-by: Liu Yuan commit 037eeff5bab04425e2275d901a6f652522fa9ce6 Author: Christoph Hellwig Date: Sat Jun 2 10:04:29 2012 -0400 sheep: remove outstanding_req_list As a fallout we can use requeue_request in gateway_op_done now. 
Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit cf5e815f6eef8a989ad5daf1b7b2b46405809c58 Author: Liu Yuan Date: Sat Jun 2 22:10:11 2012 +0800 sheep: fix wrong arg for resume_wait_obj_requests() Signed-off-by: Liu Yuan commit 3409c644bfb833a67d0bbcf0934c34c864469a22 Author: Liu Yuan Date: Sat Jun 2 22:10:11 2012 +0800 sheep: don't allow unauthorized node join when the cluster is in wait_for_join This fixes broken cluster_sanity_check() Signed-off-by: Liu Yuan commit 5f6b83a5a296a706a6875c8e441f546acef61690 Author: Liu Yuan Date: Sat Jun 2 22:10:09 2012 +0800 logger: allow sheep dump core file for non-root startup As a side effect, the coredump file will be located /path/to/store dir instead of previous '/'. We also don't overlap core file when multiple sheeps run on the same machine. Signed-off-by: Liu Yuan commit 750105b50af4b10d376ab0368420eaa6dc2c99ca Author: Liu Yuan Date: Fri Jun 1 23:21:14 2012 +0800 sheep: fix gateway_op_done() We should return after queueing it on wait_rw_queue. Signed-off-by: Liu Yuan commit 214717bc39e74ec40684a765dbafba30632a581e Author: Liu Yuan Date: Fri Jun 1 23:12:09 2012 +0800 sheep: add two assertion to clarify the code a bit This two cases are actually executed with two different context, adding two assertion both make code readable and safe. Signed-off-by: Liu Yuan commit c2f92ddcd158cec243ea9e53a87e52b97cd61373 Author: Liu Yuan Date: Fri Jun 1 21:09:38 2012 +0800 sheep: handle recovery request in check_request_in_recovery() We should call req_done() when is_recoverying_oid() returns true for recovery request, instead of going down to find nothing recovered. Signed-off-by: Liu Yuan commit cdd699f7fc7e9a88c29ab34ebf552054ebebd789 Author: Liu Yuan Date: Fri Jun 1 17:25:55 2012 +0800 sheep: remove resume_recovery_work() When the requested object is in sys->outstanding_req_list, the object is already being recovered, so we don't need check this busy status and should proceed. 
Signed-off-by: Liu Yuan commit be41628f98987c64fb694cb74c1addb998419c30 Author: Liu Yuan Date: Fri Jun 1 17:25:50 2012 +0800 sheep: refactor recovery.c - split object_list handling from object recovering process - factor recover_object_main Signed-off-by: Liu Yuan commit 070620b69e4c3d5c42a92753e663b871196662ee Author: Liu Yuan Date: Fri Jun 1 17:25:46 2012 +0800 sheep: rename check_request_busy() It only checks if the targeted oid is in recovery, so better rename it to reflect the change. Signed-off-by: Liu Yuan commit 936d07257e7a2ac3ee3e7c71778ee20b9447a549 Author: Liu Yuan Date: Fri Jun 1 17:25:02 2012 +0800 sheep: remove sys->req_wait_for_obj_list Since now farm has a per-object lock, thus support concurrent access to the same object, there is no need for sys->req_wait_for_obj_list Signed-off-by: Liu Yuan commit 54f4b920ab5fe1f2c97924754caa3e9868d49ba9 Author: Liu Yuan Date: Fri Jun 1 17:20:28 2012 +0800 farm: add concurrent access support to the same object Signed-off-by: Liu Yuan commit 9343d45ca971f2a858f4190e5728bf0b2781792d Author: Christoph Hellwig Date: Fri Jun 1 04:52:51 2012 -0400 sheep: fix SD_RES_OLD_NODE_VER handling in check_request_epoch Just complete the request with SD_RES_OLD_NODE_VER instead of trying to call the end_io handler to do that, which causes a list deletion while the request hasn't been added to any list. Signed-off-by: Liu Yuan commit 2858ba6185205c658fe4d1bd739dcd5d31b37c7b Author: Christoph Hellwig Date: Fri Jun 1 04:52:50 2012 -0400 sheep: split check_request Split check_request into helpers to check the request epoch, and to check if the request access a busy object. Peer I/O requests call both helpers, while gateway requests never need to call check_request_epoch because we just assigned the epoch to be the current one. Also lift the check for cached objects into queue_gateway_request and remove the now unused local_cow_oid field in struct request. 
Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 65108aa8b64721e558f6dfc119af52eda871f87b Author: Christoph Hellwig Date: Fri Jun 1 04:52:49 2012 -0400 sheep: split io_op_done Use a trivial completion handler for peer I/O requests and a slightly more complicated one for gateway requests instead of interwinding the two. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 57b0c49df958c910bbbabf273ba25d181efc2086 Author: Christoph Hellwig Date: Fri Jun 1 04:52:48 2012 -0400 sheep: split queue_io_request Split queue_io_request into separate versions for gateway and peer requests, and merge the respective versions of setup_access_to_local_objects into their only callers. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit c93689f0ccdc5c96cf9952e59205a8d036de38d0 Author: Christoph Hellwig Date: Fri Jun 1 04:52:47 2012 -0400 sheep: merge process_io_request into queue_io_request Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 2c84a47731f764bc20bf5f4057a1e0133103c818 Author: Christoph Hellwig Date: Fri Jun 1 04:52:46 2012 -0400 sheep: use requeue_request to retry requests in io_op_done If we retry a request from io_op_done we currenly opencode the sequence in queue_request, except that the call to check_request is missing. Given that the epoch may change the call to check_request is nesscary and we should call queue_request in io_op_done as well. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit ade0c16061387014092715567ff314f8654f59d4 Author: Christoph Hellwig Date: Fri Jun 1 04:52:45 2012 -0400 sheep: dont resume pending requests or recovery when retrying If we have to retry a request that means it still blocks any other request or recovery item waiting for it, so calling resume_pending_requests or resume_recovery_work for this case is pointless. 
Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit d0e15eb1a2b85cba34ecc3516b8b8648b76e5465 Author: Christoph Hellwig Date: Fri Jun 1 02:35:54 2012 -0400 zookepeer: fix fix is_blocking_event The join request is the blocking event, not the join response. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 06414cb12dffd0f32c843f3295439039d54327d8 Author: Christoph Hellwig Date: Fri Jun 1 02:37:37 2012 -0400 sheep: free msg in cluster_op_done cluster_op_done needs to free the message passed to ->unblock, just like we do for the non-blocking notifications to avoid a memory leak. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 15cc0b71f05f6039d75d4f1a70e9b11dcded44a1 Author: Christoph Hellwig Date: Thu May 31 11:04:26 2012 -0400 sheep: fix local_get_node_list before the node joined a cluster The GET_NODE_LIST command has the force flag and thus can be executed before the cluster has started, which includes the short time before the sheep even has any node list, in which we can trigger the assert for current_vnode_info in get_vnode_info. Fix this by checking for a NULL current_vnode_info first, for which we have to move the implementation to group.c. To me this makes sense anyway as the node list is a group.c internal detail to start with. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 82a199afbd39ed1e7967dabf720f9228030f2794 Author: Liu Yuan Date: Fri Jun 1 14:14:35 2012 +0800 Revert "sheep: remove the force flag from SD_OP_GET_NODE_LIST" This reverts commit 3fe64f2e578e4a50282feeec0a707b9b1fce9698. commit 3fe64f2e578e4a50282feeec0a707b9b1fce9698 Author: Christoph Hellwig Date: Thu May 31 12:11:58 2012 -0400 sheep: remove the force flag from SD_OP_GET_NODE_LIST Getting the node list when a sheep isn't fully up can trigger an assert in get_vnode_info. Given that there is very little use in getting the node list from a cluster that isn't operational yet simply remove the force flag. 
Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit ebfb5a452c1265cc34129367af3cc75bc5280968 Author: Christoph Hellwig Date: Thu May 31 08:50:22 2012 -0400 sheep: do not use strbuf for screen_obj_list "sheep: add a helper function to copy out data from strbuf" added a new strbuf_copyout helper that NULL terminates strings copied out of a strbuf. For screen_objlist this leads to an off-by-one in the object list length and recovery failures in many test cases. Instead of simply reverting that change, this patch changes screen_obj_list to use a more efficient algorithm for calculating the list of oids to be replicated to the local node: - for each candidate OID we do a binary search on rw->oids for the number of elements before this call to screen_obj_list to replace the old call in merge_objlist. - then directly append the new oid to rw->oids and increment rw->count - after we are done processing the object list from one node we do a qsort pass over the oid list to make sure it will be sorted for the next iteration of screen_obj_list. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 6c280cd9b823f6681251e058db1cbfa4a7cddf67 Author: Liu Yuan Date: Thu May 31 20:56:21 2012 +0800 sheep: return NETWORK_ERROR when remove_object() fail to connect_to() Liu Yuan commit 4abd4166f7d0da096e3a0f7450033837bd897bf2 Author: Christoph Hellwig Date: Thu May 31 04:46:52 2012 -0400 sheep: move all object cache code into object_cache.c Move all code handling the object cache into object_cache.c, and thus allow keeping its implementation details private. Also add a missing sys->enable_write_cache for the SD_OP_FLUSH_DEL_CACHE command.
Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 899150260180ccab4edf45ee2ad2f98617bf7269 Author: Christoph Hellwig Date: Wed May 30 10:39:23 2012 -0400 sheep: pass the old and new vnode_info to recovery Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 97fd2de17184063b6235ee9636df0fe967e3cf66 Author: Christoph Hellwig Date: Tue May 29 05:36:42 2012 -0400 accord: do not overload event types Use different types for join requests vs responses, and block vs notify events intead of using the blocked field to overload the type. (compile tested only) Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit e429ab527755f5eb5b00fa43a195acd0a9380cf8 Author: Christoph Hellwig Date: Tue May 29 05:37:21 2012 -0400 zookeeper: do not overload event types Use different types for join requests vs responses, and block vs notify events intead of using the blocked field to overload the type. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit eefa308b968a609076c837d778115be55341efdc Author: Christoph Hellwig Date: Wed May 30 11:14:08 2012 -0400 collie: ensure images size always are a multiple of 512 bytes Sheepdog provides a block devices interface with a sector size of 512 bytes, ensure we never set image sized not a multiple of 512 bytes, as that would confuse consumer with the worst case beeing that the size gets truncated down when doing the naive shift down to get the sector count. 
Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 90cb27018332c485a0bacb882e5d4c94df669811 Author: levin li Date: Wed May 30 17:48:55 2012 +0800 recovery: flush waiting requests after current recovery work was reset When a new recovery work is to replace the old one, we should flush the waiting requests after the recovering_work is set to the new one Signed-off-by: levin li Signed-off-by: Liu Yuan commit fd0334d688e0af14867c30af0b3de5f071eab746 Author: levin li Date: Thu May 31 10:22:58 2012 +0800 sheep: use queue_request when resume waiting requests When a waiting requests due to epoch inconsistency or object in recovery is resumed, we should check it again to determine whether the current condition can fulfill its need to make it run, so we call queue_request to reinitialize and check the request again. Signed-off-by: levin li Signed-off-by: Liu Yuan commit 95a226a9a56cdfe3719166e5ff70751be0e2fc9d Author: MORITA Kazutaka Date: Wed May 30 00:35:55 2012 +0900 sheep: add a helper function to copy out data from strbuf This patch also fixes a problem that local_get_store_list() doesn't set a null terminated string. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 854927ff97898856209f5bbc7eee087599c536af Author: Christoph Hellwig Date: Tue May 29 05:14:43 2012 -0400 accord-driver: fix header location It seems like accord install accord.h directly into the include directory, not into a separate subdirectory. If anyone installs the headers into the sudirectory the code after this patch can still compile by using the right include path. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit a95e30f9ebace3b60122c3406c1467f0d458e937 Author: Liu Yuan Date: Wed May 30 16:31:19 2012 +0800 sheep: fix update_cluster_info() Even cluster is in the halt state due to lack of live nodes, it should allow new nodes to join in and get vdi bitmap. 
Signed-off-by: Liu Yuan commit bdee94f84f05ae385ac879966f9cf5a5aaf3ac2c Author: levin li Date: Wed May 30 12:04:50 2012 +0800 make recovery not to retry when recover_object_from_replica() fail Since we make sheep to wait to retry when epoch is inconsistent, recover_object_from_replica() will never get a response with SD_RES_NEW_NODE_VER, because the peer node will retry the request itself locally until epoch gets consistent. If epoch of request sender is old than the receiver, it would get SD_RES_OLD_NODE_VER, in this case, it means the epoch it's to increment and soon a new recovery work would replace the current one, we should not waste time recovering for the out-of-date recovery work, what we should do is to make the current recovery work cease to wait for replacement. As for SD_RES_NETWORK_ERROR, currently, recover_object_from_replica() will get SD_RES_NEWWORK_ERROR only if there's an EIO when reading the object, in this case we should not make recovery retry, because next time it may get an EIO either and so that make the recovery work hang there retrying constantly, we should make it retry another copies or in another epoch. Signed-off-by: levin li Signed-off-by: Liu Yuan commit cefd3a71f18f73f0e11e8f8ec3008b6e573ced35 Author: Christoph Hellwig Date: Tue May 29 07:00:22 2012 -0400 corosync: do not overload event types Use different types for join requests vs responses, and block vs notify events intead of using the blocked field to overload the type. 
Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit c502cd61dc21174a12d8b8684dcf58ee00427932 Author: Liu Yuan Date: Wed May 30 00:50:31 2012 +0800 sheep: let forward_read_obj_req() read random copy Read random copy from cluster for better load balance, useful for reading base VM's COW objects Signed-off-by: Liu Yuan commit f4a13fb98da806580701a63b7f905040ea2ff988 Author: Liu Yuan Date: Wed May 30 00:47:44 2012 +0800 sheep: further refactor functions to use forward_read_obj_req() Signed-off-by: Liu Yuan commit 1b65b9e2a431039d645ad388f8c15e27d9cd7887 Author: Liu Yuan Date: Wed May 30 00:45:40 2012 +0800 sheep: refactor read/write_object() It would be nice if all the read/write goes to the forward_read/write_req_obj() This also cleans up the code a bit. But this patch goes further than refactor: - for now only vdi opreation calls read/write_object(), which originally use connect_to() but forward_read/write_req_obj() does a timeout read/write. - I'm planing to use these functions for object cache writethrough mode. - write_inode_cache() -> write_object_cache() Signed-off-by: Liu Yuan commit b7669b8e1b12449bd69d7ba948d168c4cfbf3242 Author: Christoph Hellwig Date: Tue May 29 03:33:08 2012 -0400 sheep: move more join code to update_cluster_info This keeps the successful join code in one place, and makes the changes in the next patch a lot easier. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 58d9c2b79814214ebbbcc661c3b5d0925f560b9c Author: Christoph Hellwig Date: Tue May 29 18:41:42 2012 +0800 sheep: store the node list in struct This will allow us to keep the previous node list around for recovery, and thus avoid reading the node list when starting recovery. I've decided to make struct vnode_info public as part of this even if I'm not entirely happy about it, but adding accessors for the node array seemed like an even uglier alternative. 
Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 2695e3bc36ed142daf671cecbee1cf20816afbb7 Author: Christoph Hellwig Date: Tue May 29 03:33:06 2012 -0400 sheep: store nodes in sys->nodes in log_last_epoch We'll need to generate a vnode list for this special case later on, so remove the special case where we log a node list from outside of sys. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 49ce12dad7037c7423f056d1f401e66a8c2cd0d7 Author: Christoph Hellwig Date: Tue May 29 03:33:05 2012 -0400 sheep: consolidate checks for enough available zones Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 64bdb36f61d108f3720db692836cdc27f03709a5 Author: Christoph Hellwig Date: Tue May 29 05:48:47 2012 -0400 sheep: remove dead code in get_cluster_status Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 68b1ea209aea8f5a70e0338f1c3436e5d324d4da Author: Christoph Hellwig Date: Tue May 29 05:42:55 2012 -0400 sheep: remove the request_queue Offload I/O requests to the worker threads as soon as we get them in queue_request, or when walking one of the lists of delayed requests. Also move the code related to queueing I/O requests from group.c and recovery.c into sdnet.c to have it in a single place. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 9e6323f86577ed70ffc670aae890d2353fe0c5e9 Author: Christoph Hellwig Date: Tue May 29 05:42:40 2012 -0400 sheep: queue cluster and local requests directly Now that we can process events and requests in parallel, and events aren't queued anymore we don't need to queue requests either. Start with offloading local and cluster requests to the workqueues directly in this patch and leave the slightly more complex handling of I/O requests to the next patch. 
Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 356998655d349be7f54ad9cedf02eebc433611f8 Author: Liu Yuan Date: Tue May 29 17:00:06 2012 +0800 object cache: use fcntl() for create_cache_object() Signed-off-by: Liu Yuan commit 62cbd0b8832e83ad9c4a048b2ac5f2cc56cbae71 Author: Christoph Hellwig Date: Tue May 29 16:51:38 2012 +0800 Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 9b8e0a657e4c280ec3cb50f1b7fefdc6401129e7 Author: Yibin Shen Date: Tue May 29 16:45:45 2012 +0800 read local base object firstly, then try a random copy in the cluster, this patch can loadbalance read traffic in a large scale cluster with lots of cloned VM with an identical base image effectively. [ fix coding style and adjust some comment - Liu Yuan ] Signed-off-by: Yibin Shen Signed-off-by: Liu Yuan commit 32e15499ee18f9d550f44e26fc6ab3ccb9025093 Author: Christoph Hellwig Date: Tue May 29 02:37:50 2012 -0400 sheep: check the response buffer size in get_obj_list Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit e30c7c6b566f08470a7e1a61134f6b3b811f2109 Author: Christoph Hellwig Date: Tue May 29 02:37:49 2012 -0400 sheep: use a normal buffer for the objlist cache The oid list isn't a string, so simply using a dynamically reallocated array makes the code simpler and easier to understand. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 743b673f9868d4be80780b33cc03485ccd0b6f83 Author: Christoph Hellwig Date: Tue May 29 02:37:48 2012 -0400 sheep: directly insert entries into the objlist cache on startup Remove the get_objlist method and just let the store driver call objlist_cache_insert for every entry it finds.
Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 8a006090c35d4455ee03db0494ad7daa8082a4be Author: Christoph Hellwig Date: Tue May 29 02:37:47 2012 -0400 sheep: hide objlist implementation better Add a new helper for the full objlist removal implementation to object_list_cache.c and thus allow hiding struct objlist_cache there. Rename check_and_insert_objlist_cache to objlist_cache_insert to describe it's purpose better and keep function names uniform. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit de12dcb0cf1a72620e8c58927bd69b73465cb4a9 Author: Liu Yuan Date: Mon May 28 22:51:24 2012 +0800 sheep: remove useless var in do_recover_object() No logic inside the function references it at all. Signed-off-by: Liu Yuan commit d7469e3e4e61f37eddb2be61f9c563fed4adbd82 Author: Christoph Hellwig Date: Mon May 28 22:00:59 2012 +0800 Use struct vnode_info to hold vnode lists in recovery instead of opencoding the data structures. This helps to keep the code simpler, especially in do_recover_object which did mind-boggling copy games before. [ Rebase to master - Liu Yuan] Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 44e5cd6dd1534d6727579e781b4f15cfe6b9dbe5 Author: Christoph Hellwig Date: Mon May 28 08:56:50 2012 -0400 sheep: add back oid_to_vnode Recovery wants to use this helper, so add it back. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit a2ebeaaf45734f2f4bb71ee79ff7564ab020cf24 Author: Christoph Hellwig Date: Mon May 28 08:56:33 2012 -0400 sheep: factor out a helper to allocate a struct vnode_info We want to use struct vnode_info in recovery code and thus need a helper to allocate on for a given node list. Factor update_vnode_info to allow for that. 
Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit a4c9b451fef05e30fcff2d758ab2cadf52b3d48a Author: Christoph Hellwig Date: Mon May 28 07:59:31 2012 -0400 local: remove the unused local_block_wq workqueue Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit bd9961dd22c99e4e42fd9d3e2ac1307224a35717 Author: Christoph Hellwig Date: Mon May 28 07:58:40 2012 -0400 local: avoid overloading event types Use different types for join requests vs responses, and block vs notify events instead of using the blocked field to overload the type. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit e9b05bb6123683e7024be736616c2de37b250554 Author: Liu Yuan Date: Mon May 28 21:25:42 2012 +0800 farm: fix unaligned memory allocation This bug will be triggered by farm_link(), which calls retrieve_object_from_snap() that passes the unaligned buf to misfortune farm_atomic_put(). Signed-off-by: Liu Yuan commit 1a6dc33e05407b9a50ff97854455e0ff2d8679c9 Author: Christoph Hellwig Date: Mon May 28 07:41:10 2012 -0400 sheep: process events directly Now that events don't wait for outstanding I/O requests there is no good need to delay their processing in a queue. In fact we can already handle notification and leave events directly without problems, only the vdi_inuse bitmap update in the join event prevents us from processing all events directly. This patch moves the vdi_inuse update into the blocking workqueue, so that we can make sure it is processed before any VDI events, while we can update the cluster status and start recovery independently and remove the complex code to queue and serialize events. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit edc1b311cebac82d9438150e718e2b926804c277 Author: Christoph Hellwig Date: Mon May 28 06:57:53 2012 -0400 add list_move and list_move_tail And use them to optimize list_del/list_add_(tail) pairs.
Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit ef9395e311c09807a154f46436c3ec3a2b0949f4 Author: levin li Date: Mon May 28 18:08:45 2012 +0800 use binary search in newly_joined() We can sort the old node list before recovery start, then we can use binary search in newly_joined() to make it faster. Signed-off-by: levin li Signed-off-by: Liu Yuan commit 95cf347ea4f28c74fc433ea3786fe19ab4032aba Author: Liu Yuan Date: Mon May 28 16:06:17 2012 +0800 sheep: let outstanding IO req doesn't block confchg event We already define the in-fly IO object as busy object, which sit on the sys->outstanding_req_list. So recovery request for this object will be queued on sys->req_wait_for_obj_list, where it will be resumed later. So there is no need to block confchg event for outstanding IO thus confchg could be processed as soon as possible. Confchg should take precedence over IO req because: Suppose doing heavy IO on each node while cluster is in recovery. Every node is issuing IO request while doing recovery. Both outstanding IO and unfinished confchg event blocks each other (nearly dead lock), all nodes are busy retrying those pending I/Os (live lock), and recovery requests are mostly denied of service, neither outstanding IO nor recovery moves on to completion. farm_write()'s epoch check function as a safe guard for follwing case from Kazutaka If there are 1 node, A, and the number of copies is 1, how does Farm handle the following case? - the user add the second node B, and there is in-flight I/Os on node A - the node A increments the epoch from 1 to 2, and the node B recovers objects from epoch 1 on node A - after node B receives objects to epoch 2, the in-flight I/Os on node A updates objects in epoch 1 on node A. - node A sends responses to clients as success, but the updated data will be lost?? 
Signed-off-by: Liu Yuan commit 4d7b5a8dff46715ea1e55304736fec9d307c93b9 Author: levin li Date: Mon May 28 12:14:06 2012 +0800 sheep: make gateway requests only retry when requested object is local We should not make gateway retry in check_request when the requested object in recovery is not local, in the case of requesting a non-local object we should make it retry in io_op_done(), if the requesting object is local, then we should make the gateway request to retry. Signed-off-by: levin li Signed-off-by: Liu Yuan commit d44fa1fc1822beff4756cafb6bcf825f0ccf28df Author: levin li Date: Mon May 28 11:07:47 2012 +0800 recovery: fix a race condition in recovery Take consider of this scene: Node A and B are in recovery A is recovering object x from B, and object x hasn't been recovered by B. B is recovering object y from A, and object y hasn't been recovered by A. Then B will response A with result SD_RES_NEW_NODE_VER, and A will also response B with result SD_RES_NEW_NODE_VER, then A and B will continually retry to recover object x and y, but always get an response SD_RES_NEW_NODE_VER, neither success, so here's a dead lock which stops the recovery from completing. Signed-off-by: levin li Signed-off-by: Liu Yuan commit e275fe18361ce9e40f84c379765c7243929c37ba Author: levin li Date: Thu May 24 19:24:50 2012 +0800 recovery: clear the object wait queue when new recovery work comes When a new recovery work replaces the old one, we should clear the waiting requests in the wait_obj_queue to make them retry for the new work, or else, the requests in the queue may never be waked up. Signed-off-by: levin li Signed-off-by: Liu Yuan commit 9c15d80ed8de73201cef18d78f3426d739cbf4eb Author: levin li Date: Thu May 24 19:06:28 2012 +0800 recovery: make IO request to wait when the requested object is in recovery When an object requested is in recovery, we should put it into the wait_obj_queue to make the request wait until the object is recovered by the recovery work. 
Signed-off-by: levin li Signed-off-by: Liu Yuan commit 40a99977201bf840d9e763aac1bee89bd44ca19f Author: levin li Date: Fri May 25 09:51:20 2012 +0800 recovery: make IO request to wait when recovery is in RW_INIT When recovery is in RW_INIT state, the IO request would be marked as recovering, then the sender will busy retrying, we should make the request wait until recovery enters RW_RUN state to determine whether the object requested is in recovery instead of just return SD_RES_NEW_NODE_VER. Signed-off-by: levin li Signed-off-by: Liu Yuan commit 87e922fce90a5399fd62a6a61af2f1511b55b9d2 Author: levin li Date: Mon May 28 11:40:05 2012 +0800 sheep: make gateway to retry when received SD_RES_OLD_NODE_VER When gateway requests get SD_RES_OLD_NODE_VER in io_op_done(), it means epoch of gateway is older than peer, and peer has set response's epoch with its system epoch, then gateway should check whether its system epoch has reach peer's system epoch which in req->rp.epoch, and whether the result is SD_RES_OLD_NODE_VER, if so, the request should be pushed into wait_epoch_queue to wait until system epoch change, but if system epoch has reached peer's system epoch, retry this request at once. Signed-off-by: levin li Signed-off-by: Liu Yuan commit e9467a89be92945cbb51772c68bad19300047956 Author: levin li Date: Mon May 28 11:29:05 2012 +0800 sheep: make requests with new epoch sleep until epoch is updated If requests comes with epoch newer than system epoch, then we shouldn't just make it done with result SD_RES_NEW_NODE_VER, if so, the sender would busy retrying this request, which may casue CPU too busy to process other request. We push the requests with new epoch into a wait_epoch_queue to make it wait for epoch consistency, after epoch changes we wake up these requests in the queue, which avoids busy retrying. 
Signed-off-by: levin li Signed-off-by: Liu Yuan commit ec1348d7db276c206d57ace8eabf1e3b757803d2 Author: levin li Date: Thu May 24 10:43:46 2012 +0800 sheep: port list_splice_tail_init() from linux kernel Signed-off-by: levin li Signed-off-by: Liu Yuan commit 23a7b6af760b7cb4b74a8fbb0dce63b3fbd96070 Author: levin li Date: Fri May 25 15:55:57 2012 +0800 fix a race in get_obj_list() Resetting the object list buffer should not be locked by the reader lock, but should be locked by a writer lock, or else it's a race. Signed-off-by: levin li Signed-off-by: Liu Yuan commit bc971b509d5304e5c301be0e79febaca2a588ec9 Author: MORITA Kazutaka Date: Sun May 20 11:59:05 2012 +0900 configure: exit if urcu/uatomic.h is not found Signed-off-by: MORITA Kazutaka commit be478244da0f4c0ddcee158edc96fc6bf6aca6e8 Author: Liu Yuan Date: Mon May 28 14:15:59 2012 +0800 sheepfs: add -okernel_cache option This option can enable us to take advantage of the kernel's page cache when running sheepfs with -k(--pagecache) option. For difference with direct-io (default)mode: 1 page cache has the inherent limits of max read pages, that is 128k. So a read requests more than 128k will be split before presenting to sheepfs. 2 direct-io mode means all the requests are directly sent to sheepfs. But fuse's kernel module has a default max read pages macro, 128k. If one wants to send requests bigger than 128k and expects not to be split, he needs to change the macro as big enough, say 4M, in case he wants to send 4M requests in one go. This macro is located in kernel_src/fs/fuse/fuse_i.h, named as FUSE_MAX_PAGES_PER_REQ by page unit. NOTE: The max write size is still limited to 128k. I am still poking a means to remove this limit. This probably needs to patch FUSE kernel module code.
Signed-off-by: Liu Yuan commit d9a4d7f3b8e61891666ca1c577af51bd73bf6791 Author: Liu Yuan Date: Sat May 26 17:19:29 2012 +0800 sheepfs: introduce dedicated printf function Now with -f option, we can redirect printf output to stderr and also save us from manually adding function name and function line to printf. Signed-off-by: Liu Yuan commit 97ab6b38e33913f5ce12d07be23b7c8ec5b2edbf Author: Liu Yuan Date: Fri May 25 18:30:18 2012 +0800 object cache: use read/write lock This allows concurrent read access to the same object, which is crucial to cloned VMs which shared one base in the same node. Signed-off-by: Liu Yuan commit 532074d556dba8ac2fa62533d0360b6d1daa34c7 Author: Christoph Hellwig Date: Fri May 25 10:41:47 2012 -0400 sheep: recovery needs to update the objlist cache recover_object_from_replica can write objects to disk without going through store_create_and_write_obj, and thus currently fails to update the object list cache. This can lead to incorrect GET_OBJ_LIST output and thus missing replicas for objects that need to be recovered again after they moved onto a new node. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit cbcd14bbde918439945bdec42fe4987ce0bba7af Author: Yunkai Zhang Date: Wed May 16 17:46:54 2012 +0800 sheep: remove check_majority() When sheep receives LEAVE event, check_majority() will be executed in __sd_leave(), it'll make network very busy as it tries to connect all sheeps each other. I don't think this checking is necessary, that is driver's work. Driver will tell us which sheep is alive and which have left. So let's remove this checking.
Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit 5160e602cffd94625246955af4046b79ef6b6089 Author: levin li Date: Thu May 24 17:32:10 2012 +0800 recovery: avoid recovering object from node left In the recovery path, sheep may get to old epoch at which some nodes have left the cluster, we shouldn't try to recover objects from these nodes, so I add a check function to check whether the target node is a valid node at current epoch. Signed-off-by: levin li Signed-off-by: Liu Yuan commit 76908b80a0741004319ba05d7ac327ca2665923b Author: Liu Yuan Date: Thu May 24 19:57:43 2012 +0800 configure: disable sheepfs automatically if libfuse-dev isn't found Signed-off-by: Liu Yuan commit 9e1c8f023b9b91ffc4e722433016e638a31e8ebc Author: Liu Yuan Date: Thu May 24 13:42:09 2012 +0800 sheepfs: use glibc's xattr header Then we don't need install extra xattr devel package Signed-off-by: Liu Yuan commit c2bd5b65a0c1749355cff0a420c6638874c06d06 Author: Christoph Hellwig Date: Wed May 23 14:03:20 2012 -0400 only build the sheepfs subdirectory if sheepfs is enabled Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 43c2ff8600540226c3302818f179701c9b5e30cc Author: Christoph Hellwig Date: Wed May 23 10:24:09 2012 -0400 zookeeper: mark internal symbols static .. and avoid useless forward declarations for them. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit d9a681dcb1f85273f5b88a4f9baad6a76495d15f Author: Liu Yuan Date: Mon May 21 23:25:59 2012 +0800 sheepfs: add config entry config exports internal state of sheepfs and enable us even to change those state on the fly. 
For e.g, control over which sheep to connect: echo 192.168.10.113:7000 > sheepfs_dir/config/sheep_info Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit ad5057876cc61551126b50cec0b89ee1d772a5f4 Author: Liu Yuan Date: Mon May 21 23:25:58 2012 +0800 sheepfs: teach volume to read/write COW objects Now we can opreate on the volume with cloned vdi, for e.g, we can boot up vdi named of 'clone' by below command: $ qemu-system-x86_64 --enable-kvm -m 1024 -drive \ file=sheepfs_dir/volume/clone,cache=writeback tailai.ly@taobao:~/sheepdog$ cat sheepfs_dir/vdi/list Name Id Size Used Shared Creation time VDI id Tag c clone 1 20 MB 0.0 MB 20 MB 2012-05-14 12:01 72a1e2 s test1 1 20 MB 20 MB 0.0 MB 2012-05-14 11:57 fd32fc snap test1 2 20 MB 0.0 MB 20 MB 2012-05-14 11:58 fd32fd - add an option to disable 'object cache' for volumes Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit efca4a34666599a98287534cd89a41ddb05dd711 Author: Liu Yuan Date: Mon May 21 23:25:57 2012 +0800 sheepfs: add options to pass the address and port of the sheep Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit c04a3ada74f0753c1c56d7cabcb96192d867e305 Author: Liu Yuan Date: Mon May 21 23:25:56 2012 +0800 sheepfs: add a socket pool to speedup connection Socket pool is used for FUSE read threads, which use threads to simulate aysnc read. All sockets point to the same gateway. - add a read/write lock to be thead safe. - add an option to dis/enable page cache for volumes. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 837f91411747f04d838d18f31c78c104a7fbc71f Author: Liu Yuan Date: Mon May 21 23:25:55 2012 +0800 sheepfs: add 'node' entry Support vdi node list & info. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit b112f08248eaad19e58314af99e927bc8548697f Author: Liu Yuan Date: Mon May 21 23:25:54 2012 +0800 sheepfs: teach volumes to unmount This is how we can deattach the volumes from sheep cluster storage. 
Usage: echo vdi_name > sheepfs_dir/vdi/unmount Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 7869802a8a937b05f85ef4557f627e26eaba226d Author: Liu Yuan Date: Mon May 21 23:25:53 2012 +0800 sheepfs: implement 'open' operation This is useful to control whether we use page cache or not. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit affd2bb86cf025522cd0766a2b7419f2f8b3f863 Author: Liu Yuan Date: Mon May 21 23:25:52 2012 +0800 sheepfs: implement 'sync' operation for volumes When we use page cache to cache vdi in sheepdog storage, we need sync operation to flush dirty bits back into sheep storage. This is wired up to the 'writeback' of the kernel's page cache. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 8d08b89d81541283e484446605381d396d26fa9b Author: Liu Yuan Date: Mon May 21 23:25:51 2012 +0800 sheepfs: add 'volume' entry This is where we can access sheepdog's storage from well received 'file' abstraction. All the attatched volume will be seen as a file in volume directory. To attatch a volume: echo test > sheepfs_dir/vdi/mount Then you will see a file entry in sheepfs_dir/volume/test, which you can do the tricks you are fond of. For, e.g, we can boot an sheepdog image as normally as for an ordinary raw image: $ qemu-system-x86_64 --enable-kvm -m 1024 -drive \ file=sheepfs_dir/volume/test,cache=writeback. This file abstraction integrates well with kernel's other component such as pagecache. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit f742a2178627fd02eb4ef34f9418c796af26ef6c Author: Liu Yuan Date: Mon May 21 23:25:50 2012 +0800 sheepfs: implement shadow file mechanism Sheepfs use this shadow file mechanism to mostly manage dentries. We might also make use of those shadow file to cache non-volatile states. 
Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 3969b8d6bd8ceac2aa414f9eaa495f72f8687b5e Author: Liu Yuan Date: Mon May 21 23:25:49 2012 +0800 sheepfs: add 'vdi' entry Support 'collie vdi list', mount & unmount (used by volume) Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 47771ebcbce951128b0fe14891eff742075631a9 Author: Liu Yuan Date: Mon May 21 23:25:48 2012 +0800 sheepfs: add 'cluster' entry This exports cluster state as collie cluster command. You can read that info with ordinary read/write. For e.g, tailai.ly@taobao:~/sheepdog$ cat sheepfs_dir/cluster/info Cluster status: running Cluster created at Mon May 14 15:45:37 2012 Epoch Time Version 2012-05-14 15:45:38 1 [127.0.0.1:7000, 127.0.0.1:7001, 127.0.0.1:7002] Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 4a3b9708572d5ae163e7ebe5e19444edb3d577c4 Author: Liu Yuan Date: Mon May 21 23:25:47 2012 +0800 sheepfs: core infrastructure Sheepfs is FUSE-based pseudo file system in userland to access both sheepdog's internal state (for e.g, cluster info, vdi list) as well as sheepdog's highly reliable storage. This building block is supposed to be modular enough to allow us to easily add more powerful tools. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 1efc480f31e33328e674789b40edbb4a14b83f39 Author: Liu Yuan Date: Mon May 21 23:25:46 2012 +0800 sheep: move strbuf and rmdir_t into lib Those are needed by sheepfs. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 76e32d824067407356ed89e2b3900192290b05ad Author: Liu Yuan Date: Mon May 21 23:25:45 2012 +0800 sheepfs: modify configure file to work with sheepfs Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 0fc006373cb394a6d8fbda0f77803ee3ac039bba Author: Christoph Hellwig Date: Wed May 23 05:18:21 2012 -0400 sheep: move gateway handling out of store.c Create a new gateway.c file for the gateway handling instead of mixing it up with the unrelated backend store.
Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 231396cdf90f6ceb109ee36d7cea7a0e60f7cca6 Author: hch@infradead.org Date: Wed May 23 05:18:02 2012 -0400 sheep: split do_io_request Split do_io_request into a local store and a gateway version to make the code more obvious. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 18dcd92ec527f69bd0b296ba4283be70ed1a95f1 Author: levin li Date: Wed May 23 15:47:57 2012 +0800 obj list cache: add a buffer to object list cache as a second level cache It's inefficient for get_obj_list() to traverse the object list rbtree to get the object list every time, so I add a buffer to cache the list, every time in get_obj_list() if the rbtree has changed which is detected by the version number, we update the buffer, then we can just read the buffer by memcpy() if the object list has not been changed, instead of traverse the full rbtree. Signed-off-by: levin li Signed-off-by: Liu Yuan commit 68b9ec785afa96711ea22392906ad5e8b9366596 Author: Liu Yuan Date: Wed May 23 19:01:22 2012 +0800 sheep: fix recovery logic Current recovey will fail for following simple test case (really surprised) ===== set -ex pkill -9 sheep rm store/* -rf # start three sheep daemons for i in 0 1 2 3; do ./sheep/sheep -d /home/tailai.ly/sheepdog/store/$i -z $i -p 700$i -W done sleep 1 ./collie/collie cluster format -c 2 ./collie/collie vdi create test0 40M ./collie/collie vdi create test1 40M pkill -f "sheep -d /home/tailai.ly/sheepdog/store/3" ===== after running the script, we can see copy isn't recoveried! 
tailai.ly@taobao:~/sheepdog$ find -name '80fd32fc00000000' ./store/3/obj/80fd32fc00000000 # <-- this node is killed ./store/0/obj/80fd32fc00000000 With the patch, we get a expected result: tailai.ly@taobao:~/sheepdog$ find -name '80fd32fc00000000' ./store/1/obj/80fd32fc00000000 # copy migrated from node 3 ./store/3/obj/80fd32fc00000000 # killed ./store/0/obj/80fd32fc00000000 The failture is rooted in the original algorithm: we just did one shot search and didn't try a breadth-first search before diving into older configuration. The fix is rather straightforward, do a breadth-first search!. With the patch, we also end up with much more simplified code. Signed-off-by: Liu Yuan commit c0f5a19e22245b609ea97cd22a77539c8912dd86 Author: Liu Yuan Date: Wed May 23 19:01:21 2012 +0800 farm: fix read_working_object() We should clear O_DIRECT flag for vdi object Signed-off-by: Liu Yuan commit d3a4f51b42519235838063f8706e97135392e6a5 Author: levin li Date: Mon May 21 12:11:03 2012 +0800 start traversing from a random node in fill_obj_list() Every node has the same sd_node order in its epoch, so in fill_obj_list(), every node starts from a same node to request the object list, which may cause the node overload. Indeed, we meet this problem when there's 960 nodes in our cluster, when in the period of fill_obj_list, some node get 'too many requests' in client_rx_handler(), so I change it to start from a random node in fill_obj_list() to make load blance. Signed-off-by: levin li Signed-off-by: Liu Yuan commit ed2d7483caa8ac24e5435d183c02b29cff34a0f4 Author: Liu Yuan Date: Mon May 21 23:07:42 2012 +0800 object cache: fix bmap calculation It uses 64 bits to represent 4M objects, so each bits will represent 64K data or object_cache_flush_and_delete() will fail. 
- add two eprintf for read/write_cache_object() Signed-off-by: Liu Yuan commit 589382c625846ad85fdb947611e30b30c650be25 Author: Liu Yuan Date: Tue May 22 10:44:00 2012 +0800 sheep: remove useless type casting Signed-off-by: Liu Yuan commit 82bff8b547f93953996e90b1b77f1f48a78a0868 Author: Liu Yuan Date: Tue May 22 10:43:59 2012 +0800 doc: add a API documentation for strbuf This is excerpted from 'git' Signed-off-by: Liu Yuan commit 0d8e273470fa2775489ea1507eb858751644b454 Author: Liu Yuan Date: Tue May 22 10:43:57 2012 +0800 script: add a script to transform simple store into Farm Usage: script/simple2farm /path/to/store/obj Signed-off-by: Liu Yuan commit f4b92c40b2213d82602dcc5f008d14e7239aab33 Author: MORITA Kazutaka Date: Mon May 21 00:48:10 2012 +0900 sheep: update sd_store only when the correct driver name is specified Users may run 'collie cluster format -b ?' to get the store driver list. In such cases, we shouldn't update sd_store. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 160e353f48b3baccf250e31f51aa388edfcab581 Author: MORITA Kazutaka Date: Sun May 20 23:39:20 2012 +0900 sheep: set sys->epoch correctly before becoming master This fixes a bug introduced by commit 43a2f684e, and necessary to pass the following testcase: #!/bin/bash set -ex for i in 0 1 2; do ./sheep/sheep /store/$i -z $i -p 700$i; sleep 1; done ./collie/collie cluster format for i in 0 1 2; do pkill -f "sheep /store/$i"; sleep 1; done for i in 1 0 2; do ./sheep/sheep /store/$i -z $i -p 700$i; sleep 1; done for i in 0 1 2; do ./sheep/sheep /store/$i -z $i -p 700$i; sleep 1; done for i in 0 1 2; do ./collie/collie cluster info -p 700$i; done Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 9103d13a163450fa91e2f25b21e950bc907d3964 Author: MORITA Kazutaka Date: Sun May 20 23:42:17 2012 +0900 cluster/accord: fix compile errors Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 278650b5a9a1481a6502098aab9042f8981a8919 Author: MORITA Kazutaka Date: Mon 
May 21 01:11:53 2012 +0900 farm: use access(2) to check file existence This simplifies the code a bit. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 68c5d63d9bee116ec5f22b9473b4d3bf6f88046c Author: Christoph Hellwig Date: Sun May 20 13:51:53 2012 -0400 consolidate the MAX_EVENT_BUF_SIZE definitions The accord, local and zookeeper drivers all need an upper bound of the message size the sheepdog might send for their communication queues. Centralize the definition into a single place, and make sure we never send larger messages from the core sheepdog code. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 9b1536a3b5b4926aa3a2c40f9447ac54c511a336 Author: Christoph Hellwig Date: Sun May 20 13:51:33 2012 -0400 local: do not zero the event queue Reading, including through mmap from sparse files always returns zeroes, so there is no need to memset the shm queue. Zeroing it causes backing pages to be allocated to it in tmpfs, and thus increases the memory usage for a not fully used queue. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit ed3dc6a3b67af0335035e44dcb2df2859413b91d Author: Christoph Hellwig Date: Fri May 18 07:40:17 2012 -0400 sheep: lock against concurrent access to the store directory This patch uses lockf on a specially create lock file in the base directory to detect early on if another sheep is using this directory and abort with a useful error message. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 827d31ab1ff3513b1ce4c3d19760646c420eea25 Author: Yunkai Zhang Date: Fri May 18 18:28:39 2012 +0800 zookeeper: fix master transfer logic When a sheep joins into the cluster, master will call sd_check_join_cb() to get the join_result which will be update to ev.buf, and all sheep will receive this update. If join_result equals to CJ_RES_MASTER_TRANSFER, master will kill itself by exit(). 
Zookeeper needs at most SESSION_TIMEOUT to detect master's leaving action, it's better to call zk_leave() explicitly before master exit. On the other hand, other sheeps will continue to process the updated JOIN EVENT. But now, Sheepdog assumes that only one sheep(the joining sheep) is alive in MASTER_TRANSFER scenario, this can simplify processing logic(maybe we will overthrow this assumption in the future for other corner-case). Based on this assumption, the joining sheep just need to reset its member_list(saved in node_btree in zookeeper driver), make it only contains itself. Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit c6b6ee2368b1c73b05fe03cf7567957368c9882e Author: Yunkai Zhang Date: Fri May 18 18:28:38 2012 +0800 zookeeper: fix node_btree_clear() reset zk_nr_nodes to 0 when clear node_btree Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit b6b6fef8832630ef32bb40b4a9bd7d63eba2467c Author: Liu Yuan Date: Fri May 18 22:06:10 2012 +0800 object cache: use the correct unlock flag for lockf() Manual says we should use F_ULOCK to unlock lockf() Signed-off-by: Liu Yuan commit 7e4544eb301fdde504fe3bf6ae82ea9fde255e23 Author: Liu Yuan Date: Sun May 20 23:39:26 2012 +0800 makefile: fix an error when no args for 'rm' This patch fixes the following err for 'make clean': ... find -name '*.orig' -or -name '*.rej' | xargs rm rm: missing operand Try `rm --help' for more information. make[1]: *** [clean-generic] Error 123 Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 70eec55a76a46d40e916a45ee1701cf90509d23b Author: Christoph Hellwig Date: Fri May 18 05:30:40 2012 -0400 sheep: remove the dispatch handler Now that we don't unregister and re-register the cluster driver event FD the need for the dispatch method goes away, instead the cluster drivers can handle their events locally. 
Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 6e001a7767b7b204251f99fe5a3b5821a0bc69d6 Author: Liu Yuan Date: Sat May 19 23:11:25 2012 +0800 store: remove get_store_dir() It is no longer used after simple store is removed Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 9307f57d85e336459ba25dfa15c343b6f4b87d7e Author: Liu Yuan Date: Sat May 19 23:11:24 2012 +0800 store: remove update_epoch_store() It is no longer used after simple store is removed Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit d286a9a02e1b7b58162c2cf969223da66ecb2e91 Author: Liu Yuan Date: Sat May 19 23:11:23 2012 +0800 fix sys->epoch race We shouldn't use sys->epoch directly in worker threads. Add an atomic helper for it. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit fd94a64474aa5dd5999ec60789c90f97453a89f8 Author: Liu Yuan Date: Sun May 20 21:00:07 2012 +0800 farm: refactor core code refactor struct siocb, this makes the code cleaner. - let sd_store->read/write() use open() internally. - remove sd_store->open()/close() - add sd_store->exist() But now we don't actually have concurrent requests to the same object, because the upper layer has excluded concurrent access by check_request(). We'll remove this constraint in a later patch set. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 45bd3104867c6340f8c65057226dd6387de4c304 Author: Liu Yuan Date: Sat May 19 23:11:21 2012 +0800 remove simple store support Simple store has many constraints, to name a few: 1) can't handle stale objects easily 2) can't support concurrent access to the same object easily 3) need outstanding IO to block confchg Currently Farm runs as well as expected, both by performance and stability, so we'd better remove it and redesign an abstracted store layer oriented for Farm.
Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 423bc80ad7c160083e08f3995e94ecc070d252ad Author: Liu Yuan Date: Sat May 19 23:11:20 2012 +0800 collie: use 'farm' as default store driver Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit d9b041e5762615dc2c292b3e50642ca948edebdd Author: Liu Yuan Date: Sat May 19 22:48:41 2012 +0800 makefile: remove .orig .rej file Old script doesn't work well with the subdir such as a sheep/farm/*.orig, so use 'find' to find the all the files. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit cada69b0ac6f738ff2b766cb4f385816781fd94e Author: Liu Yuan Date: Sat May 19 18:35:29 2012 +0800 farm: remove stale objects for crashed sheep when it comes back During the window the crashed sheep restarts, the objects might be updated. So we need to purge store/obj directory to assure consistency. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 2210dfc9bf63c2c02bafd4f1a559e4ca4a519544 Author: Yunkai Zhang Date: Thu May 17 23:01:38 2012 +0800 sheep: remove unregister_event from process_event_queue() In old code, we call unregister_event(cdrv_fd, ...) in process_event_queue() when sheep receives cluster EVENT, we will register cdrv_fd into epoll again in event_done() after __sd_xxx() finished. This is dangerous! In our testing, for some reason, __sd_xxx() may be blocked by network issue, as a result event_done() would not be executed, and cdrv_fd would keep outstanding from epoll, then all new coming EVENT could not be process immediately. This will make sheep hard to complete recovery. Now, we call update_cluster_info() in sd_xxx_handler() directly so that we can process new EVENT one by one immediately, and needn't to wait previous EVENT's __sd_xxx() finished. So we can remove unregister_event() from process_event_queue() safely. 
Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit b3359b34686b7095d24621b78772f3c49b1210f4 Author: Christoph Hellwig Date: Thu May 17 11:13:56 2012 -0400 sheep: fix node comparism in log_last_epoch We only logged the joining node instead of all previous nodes due to a rebase error over the node_eq changes, causing major confusion during recovery. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 02860d165c8fa2d421a4674d11b1003a182b5d05 Author: Christoph Hellwig Date: Thu May 17 11:23:59 2012 -0400 corosync: remove struct corosync_block_msg This structure was obsoleted by a recent patch and should have been deleted. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit cf308c18facb2ba7343ba59b15cd441f050c85d5 Author: hch@infradead.org Date: Thu May 17 06:34:10 2012 -0400 sheep: mention -o/--stdout in --help output Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit c6c72cbaa01ec6ea0d9f2099b970231e1f2dbde4 Author: Christoph Hellwig Date: Thu May 17 06:29:07 2012 -0400 use unions for protocol request and response structures Doing so greatly decreases the amount of casting and improves type safety. Also switch from struct assignments to mempcys in a few places where we copy them around. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 43a2f684e1217104e3e04bbcf3ec3ca6aa8e3318 Author: Christoph Hellwig Date: Thu May 17 04:15:16 2012 -0400 sheep: factor node list updates Add a new helper that updates sys->nodes and sys->nr_nodes as well as the vnode list, and use it in all places that update cluster membership. In __sd_leave_done it can be used as is and gives a nice cleanup, in the master transfer case sd_join_handler it can also be used as is, but I've added an assert for the previously implicit assumption that no other nodes can exist. 
The tricky case is update_cluster_info/finish_join, where we first need to write an entry into the epoch log for the epoch before the joining code so that the recovery code can do the right thing. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit f348f0925bcf96770f827fc4adabf26fcd500021 Author: Christoph Hellwig Date: Thu May 17 04:14:51 2012 -0400 sheep: pass a node list to update_epoch_log Assuming some arguments come from sys while the epoch is passed is a bad convention. In addition the next patch will add a caller that doesn't take the nodes array from sys. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 0d8f61a3e1f3707f0d5a9ecba52f7fc3f8944736 Author: Christoph Hellwig Date: Thu May 17 04:13:33 2012 -0400 sheep: merge join() into the only caller .. and clean up the result a bit. This makes the code a bit more easily understandable for the bigger changes to follow later. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit fcacec74b61e015a0854c7483787ba8dc68efcae Author: Christoph Hellwig Date: Thu May 17 16:59:59 2012 +0800 sheep: pass vnode_info to the object cache There is no need to grab a local copy of the vnode information in the object cache, as the callers can pass it. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit fa2f71feb75aae9087d7d43bc14b37562f1b255e Author: Christoph Hellwig Date: Thu May 17 16:59:04 2012 +0800 sheep: pass vnode_info to VDI operations There is no need to grab a local copy of the vnode information in the VDI ops, as we already get it passed in through struct request. For deletions we have to grab an additional reference as deletions may still be in progress by the time finish_request is called. 
Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit b67ed0dc9b133de1207e11fc4db6ed2f2dc691ba Author: Christoph Hellwig Date: Thu May 17 16:58:06 2012 +0800 sheep: use urcu atomics for the vnode_info reference count Using atomic operations for the reference count of the vnode_info structure allows for grabbing additional references outside the main thread. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 1adde5892a245bee17e77a1c2a57ac67c8c84361 Author: Christoph Hellwig Date: Thu May 17 16:56:59 2012 +0800 sheep: pass a struct request to ->process_work The main process_work now gets a struct request passed to it to unify the calling conventions for all instances, and allow using the vnode information from the VDI operations in the next patch. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 9168c368b90ccb2c7e8bd6428c4ba4bba48d7c47 Author: Liu Yuan Date: Thu May 17 15:25:59 2012 +0800 script: add a cleanpatch from Linux kernel source This would ease the patch with trivial issues, and speed up patch submition process a bit. Signed-off-by: Liu Yuan commit 48fde8304c3fa713cf549ad5523b03151917509a Author: Christoph Hellwig Date: Wed May 16 03:04:05 2012 -0400 sheep: rewrite blocked notifications The prime AIM of this patch is to fix the racy access to sys->pending_list in do_cluster_op, but it actually cleans up the surrounding code massively as well. It contains three tightly related changes: - split a new ->block operation from ->notify. It is used to tell the cluster driver to block new events, but does not contain a message by itself yet. - the block_cb callback previously passed to ->notify is not passed to ->block any more, but a new sd_block_handler callback is provided that can be called from the cluster driver in main thread context. 
sd_block_handler takes care of grabbing the first request from sys->pending list in the main thread, and then scheduling a workqueue to handle the cluster operation - a new ->unblock cluster operation is added which is called from the ->done handler of the block workqueue to tell the cluster driver to unblock the event processing, as well as sending the message with the results from the main processing (or simplify the cluster wide notification if there is no work routine in the ops table) Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 3760b41d4a3efb41c06e928a013820232167e3f5 Author: Christoph Hellwig Date: Thu May 17 02:16:17 2012 -0400 corosync: avoid useless allocations in cdrv_cpg_deliver Only two of the five cases in cdrv_cpg_deliver need to allocate a new cevent structure. Move the allocation of it to the places that actually need it. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit bfbfeefb605760738218fdd0fcea2713f7a27c69 Author: Liu Yuan Date: Thu May 17 10:56:04 2012 +0800 object cache: move calc_object_bmap() out of lock It doesn't need lock protection at all. - remove unnecessary check against len, because it will never be 0 - add a warning in push_cache_object() to indicate nothing to flush, if this happens, we have a bug. Signed-off-by: Liu Yuan commit f4dd46c684ea910ff2b2494bd59aa10962fbc1f8 Author: MORITA Kazutaka Date: Wed May 16 10:16:09 2012 +0900 sheep: add an option to disable object cache It is highly recommended to read the document carefully before using the object cache. https://github.com/collie/sheepdog/wiki/Backend-Stores-and-Object-Cache This patch is useful for users whose environments don't meet the requirements (e.g. sheep requires additional store on the gateway nodes, we shouldn't access the same image from the different nodes even if it is not the same time, etc). If you are unsure how the object cache works, it's safe to disable it. 
In future, it's better to support multiple cache features because the requirements for the sheepdog write cache seem to be quite different among users. Signed-off-by: MORITA Kazutaka commit eccdd7c75d6567a4bd469aaf9dd702a6c25054fe Author: MORITA Kazutaka Date: Wed May 16 09:53:03 2012 +0900 sheep: read cached data if it exists in read_object() Signed-off-by: MORITA Kazutaka commit b1ac6152fbab78a88bdd54e3d65add6271b9a044 Author: levin li Date: Mon May 7 13:45:32 2012 +0800 fix a bug of deleting base vdi fail Take a view of the following snapshot tree: base vdi --> snapshot vdi --> cloned vdi when cloned vdi has its own data objects created by copy-on-write, we firstly delete the cloned VDI, then delete the base VDI, at last we delete snapshot VDI, then the snapshot VDI would fail to be deleted in this case because in the old code, I try to delete all the VDIs from the VDI tree, but the base VDI and the cloned VDI have been deleted, and it causes an error which finally causes the snapshot VDI deletion to fail. This patch is another version of my previous one which is here: http://lists.wpkg.org/pipermail/sheepdog/2012-May/003332.html I simply reset the delete_error flag, as Kazutaka mentions, it's not good to do this, but we should never try to delete the VDIs already deleted again, so I mark a VDI as deeply deleted by clearing its name and size, which means the VDI and its data objects have already been deleted, then in the next deletion work, we can just ignore these VDIs. If the data objects haven't been deleted, then we just clear its name to mark it as deleted, next time, we can try to delete its data objects.
Signed-off-by: levin li Signed-off-by: Liu Yuan commit 9f25ae9d9ca7cb6ba596b1c9dab755b54104a5a6 Author: levin li Date: Mon May 7 13:45:31 2012 +0800 traverse the VDI tree before delete cloned VDI Even if the VDI is a cloned VDI (not the VDI just created by 'clone' operation, but the cloned VDI on the leaf node of the VDI tree), we also should traverse the vdi tree from the root VDI. But if there's a VDI not deleted in the tree path, and the VDI to delete is a cloned VDI, we should delete it's data objects so as to save disk space. If all the VDIs have been deleted in the tree path, then we traverse the path to delete all the objects of the VDIs. I'd like to explain again why we need to deleted the objects created by copy-on-write when we try to delete a cloned VDI, in the old logic, objects of cloned VDI can only be deleted after all the VDI in the tree path has been deleted, but there's a problem, we may clone many VDIs from one snapshot, and these VDIs can be deleted frequently, but we may always not delete the snapshot VDI, so in this case, objects of cloned VDI would always stay in the disk, as we know, they're already useless and should be deleted, and waste too much disk space, but we can only make it deleted after all the VDI in the tree path, including the snapshot VDI have beed deleted, it's really not good. Signed-off-by: levin li Signed-off-by: Liu Yuan commit 1217aee830bcf3c4b202913631c27a35082ea591 Author: Shevek Date: Wed May 16 21:26:35 2012 +0800 sheep: handle master crashing before sending join request This patch fixes one of the cases described by Huxinwei and Liu Yuan where sheepdog fails to elect a master. A longer description is in the patch. A problem arises if a node joins the cluster and generates a confchg event, then crashes or leaves without sending a join request and receiving a join response. The second node to join never becomes master, and the entire cluster hangs. 
This patch allows a node to detect whether it should promote itself to master after an arbitrary confchg event. Every node except the master creates a blocked JOIN event for every node that joined after itself, therefore the master is the node which has a JOIN event for every node in the members list. Signed-off-by: Shevek Signed-off-by: Liu Yuan commit 7b29ee83c9e6638d09bc7c9f4acd4d9ae0c5d199 Author: Yunkai Zhang Date: Wed May 16 17:04:44 2012 +0800 sheep: delete useless get_vdi_bitmap_from_sd_list() In __sd_join(), get_vdi_bitmap_from_sd_list() do a lot of duplicated works with following for loop as w->member_list contains all nodes in sys->nodes. So we can drop get_vdi_bitmap_from_sd_list() safely, it can make __sd_join() looks more cleanly. Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit 23aa5d106fdebfbe8482f13b92ec31a97c06ce6d Author: Christoph Hellwig Date: Wed May 16 02:58:58 2012 -0400 sheep: call do_cluster_request from the main thread There are two problems with calling do_cluster_request from a work queue: 1) sys->pending_list is expected to only be used from the main thread and does not have any locking 2) the ->notify cluster driver metho is expected to be called from the main thread Simplify call do_cluster_request directly from process_request_queue instead of offloading it to a workqueue to fix this, and document the assumptions in the code. Based on an earlier patch from Yunkai Zhang . Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 7280771b391faaf2b1cbb49db9f2b14225defdf5 Author: Sebastian Wiedenroth Date: Wed May 16 09:52:48 2012 +0200 define constant for uint64 decimal string representation Signed-off-by: Sebastian Wiedenroth Signed-off-by: Liu Yuan commit cc79ff50716bee66c365b8fb9e62a83428227f9b Author: Liu Yuan Date: Wed May 16 17:18:31 2012 +0800 object cache: use bitops for bits manipulation. 
Signed-off-by: Liu Yuan commit 360f1541d4f5e7a6b39d8f1707d714c9644d059a Author: Liu Yuan Date: Wed May 16 17:18:27 2012 +0800 add fls, fls64 operation for bit manipulation Signed-off-by: Liu Yuan commit 2df7550fe04c418bdcf78d18b580b345d2503353 Author: levin li Date: Tue May 15 18:28:23 2012 +0800 fix a bug in the binary search in get_vnode_pos() Consider the corner case that id is less than the id of the first entry in entry list. Signed-off-by: levin li Signed-off-by: Liu Yuan commit d81db818d10ff398e2831ff5fe7046495564c9ea Author: Christoph Hellwig Date: Tue May 15 12:54:47 2012 -0400 collie: allow explicit read sizes that are not a multiple of 512 Collie vdi read already can handle read sizes smaller than a block, which is used when reading data to stdout without an explicit size argument. Also allow using this code when an explicit size argument is given. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit c6cb13432383229b1f5c22acb71920cd846a06b5 Author: Liu Yuan Date: Wed May 16 10:38:51 2012 +0800 remove annoying *.orig and *.rej for 'make clean' Signed-off-by: Liu Yuan commit 6fc0f1e6fa125106461c8920864f2897d400a7db Author: Sebastian Wiedenroth Date: Tue May 15 11:36:17 2012 +0200 use large enough strings for raw output of uint64 sizes Signed-off-by: Sebastian Wiedenroth Signed-off-by: Liu Yuan commit 17d73541b0ca5398396191c12bdd7aa3612107e2 Author: Yunkai Zhang Date: Tue May 15 17:25:04 2012 +0800 object cache: make the flushing data more fine grained Each time when sheep flushes a cache object, it will flush its whole data(4M) to the cluster, even if that object only contains one dirty byte. Now I split a cache object into multiple blocks, each block is 128 KB (this size can be switched in range[64, 128, 256, 512] KB defined by BLOCK_SIZE macro). Each time when client writes a cache object, sheep will calculate which block would be modified and mark it as dirty block. This dirty information is saved in a bitmap.
When sheep flushes a cache object, it will not flush the whole data again, instead, it will read the bitmap of this object and only send the data marked as dirty block. In addition, I replaced flock with lockf so that we only lock on the section we care about of an object file. Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit 69b312394a4d77141c4b016bdbc85f351230dbe5 Author: Christoph Hellwig Date: Tue May 15 04:57:08 2012 -0400 sheep: use need_consistency_check consistently Move all remaining checks if we need a consistency check into need_consistency_check, which cleans up process_request_queue, and reduces the number of calls to object_is_cached which is more expensive than the other calls. Pass the request directly to need_consistency_check and set_consistency_check and thus avoid having to deal with the object header inside process_request_queue. Also remove the unused local copies variable while we're at it. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit fe9f841f8aeeec71b421329795e53c75af4af0ac Author: Christoph Hellwig Date: Tue May 15 04:53:58 2012 -0400 cleanup obj_to_sheep(s) a bit more Move the call to fnv_64a_buf into get_vnode_pos, and remove the unreachable return statement at the end of get_vnode_pos. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit eb433046c46b6a69837678cb25032b5ae3038f53 Author: Christoph Hellwig Date: Tue May 15 04:53:09 2012 -0400 sheep: call req_done directly Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 52adc3bdf69bde4ae5a4461a2bd36453468090a5 Author: Christoph Hellwig Date: Tue May 15 04:38:10 2012 -0400 collie: allow explicit write sizes that are not a multiple of 512 Collie vdi write already does an internal read-modify-write cycle for write sizes that aren't multiples of the block size, which is used when writing data from stdin without an explicit size argument. Also allow using this code when an explicit size argument is given.
Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 4fb3e1afb68797d8ba1ee50ede3ae2bedbeed147 Author: levin li Date: Tue May 15 14:48:54 2012 +0800 use binary search in hval_to_sheep() As we know, binary search is much faster than sequential search, we can use binary search to make hval_to_sheep() faster. remove hval_to_sheep() and hval_to_sheeps() to make the code cleaner. Signed-off-by: levin li Signed-off-by: Liu Yuan commit 19604871136e3c1355b6f817153f1eebe0b435c1 Author: levin li Date: Tue May 15 14:48:53 2012 +0800 add oid_to_vnodes() and obj_to_sheeps() to avoid too much vnodes traverse oid_to_vnode() and obj_to_sheep() traverse the vnode list to find the target vnode, many times, sheep needs to call these two functions nr_copies times, it means we also need to traverse the vnode list nr_copies times, it's absolutely a waste, so I add these two functions to make it only traverse one time in stead of calling obj_to_sheep()/oid_to_vnode() nr_copies time. Signed-off-by: levin li Signed-off-by: Liu Yuan commit f52b45d4eede375ca25b0bb79efe9f517b2fc278 Author: Christoph Hellwig Date: Mon May 14 11:43:37 2012 -0400 sheep: call check_request from resume_pending_requests Recovery for all objects isn't nessecarily done when we call resume_pending_requests, so check the request again. 
The following test case: sheep -l7 -d /tmp/sheep/7000 -p 7000 collie cluster format --copies=1 -p 7000 collie vdi create 'test-vdi' 300M -p 7000 dd if=/dev/zero count=300M | collie vdi write test-vdi -p 7000 collie vdi read test-vdi 0 1M -p 7000 > /dev/null sheep -l7 -d /tmp/sheep/7001 -p 7001 collie vdi read test-vdi 0 300M -p 7001 > /dev/null fails without this patch with errors like: Failed to read object 7be7f900000003 No object found Failed to read VDI Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 2ba37b36b9073e70f3703d7d97d162bb90cd080e Author: Christoph Hellwig Date: Fri May 11 09:19:05 2012 -0400 sheep: cleanup request list handling Remove the embedded cevent in struct request and use a single list for the requests that are under I/O, delayed and completed. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit a5711313b49c98c35539592a481e1d53f4df0ac0 Author: Shevek Date: Tue May 15 13:29:36 2012 +0800 Replace calls to node_cmp with a typesafe node_eq Make node_eq typesafe, rather than accepting void *, since C99 will allow us to accidentally cast any type pointer to a void * without warning. Signed-off-by: Shevek Signed-off-by: Liu Yuan commit cb049cc9184c6b0132741fb117a7cb6e3bcd23c4 Author: Christoph Hellwig Date: Thu May 10 10:48:11 2012 -0400 don't check for liburcu pkg-config files For now we only use the uatomic.h header from liburcu and don't need to link against the actual library, so we don't bother with the PKG_CHECK_MODULES check. If an actual dependency is introduced it will need to be added back, including a non-pkgconfig check for older distributions Like Debian -testing where the current check fails. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 9beb630f0c75d628e5950d7d6992fbbd0a462eab Author: Christoph Hellwig Date: Thu May 10 10:47:45 2012 -0400 fix autoconf warnings It seems like newer autoconf versions need code that is test compiled to be enclosed by a AC_LANG_SOURCE macro. 
Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 844706a38c346d7400d0641cf88722f8a7ce6d9c Author: Yunkai Zhang Date: Thu May 10 17:15:10 2012 +0800 sheep: dprintf connection info in connect_to() We have dprintfed the connection info which contains client's IP:PORT in destroy_client(). Now, I dprintf the server's IP:PORT when sheep calls connect_to(). These IP:PORT pair info can help me to analysis network problem. Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit e959447062dd85982acb2b9a9bf7f1432f76c90c Author: HaiTing Yao Date: Thu May 10 17:17:22 2012 +0800 sheep: need not read many copies of bitmap Node maybe need read all of the bitmaps when the epoch is not zero. This perhaps occurs when we restart the shutdown cluster or join one once existed node. Usually, node need not read many copies of bitmap. If the cluster status is OK and the node status is waiting for format, node just need read one bitamp. Signed-off-by: HaiTing Yao Signed-off-by: Liu Yuan commit 75face7f9a0cf49eb16cb7831e46e1f022d7ffd0 Author: Shevek Date: Wed May 9 18:40:00 2012 -0700 PATCH S007: Fix two memory leaks ==7463== 72 bytes in 1 blocks are definitely lost in loss record 11 of 38 ==7463== at 0x4C279F2: calloc (vg_replace_malloc.c:467) ==7463== by 0x404423: zalloc (util.h:58) ==7463== by 0x408090: sd_join_handler (group.c:1181) ==7463== by 0x41BFA1: __corosync_dispatch_one (corosync.c:496) ==7463== by 0x41C421: __corosync_dispatch (corosync.c:582) ==7463== by 0x41CC8D: cdrv_cpg_deliver (corosync.c:741) ==7463== by 0x504E978: cpg_dispatch (cpg.c:390) ==7463== by 0x41D646: corosync_dispatch (corosync.c:981) ==7463== by 0x4053FB: group_handler (group.c:283) ==7463== by 0x421DA3: event_loop (event.c:181) ==7463== by 0x40425F: main (sheep.c:272) ==7463== ==7463== 544 bytes in 1 blocks are definitely lost in loss record 29 of 38 ==7463== at 0x4C28F9F: malloc (vg_replace_malloc.c:236) ==7463== by 0x424004: xmalloc (util.c:36) ==7463== by 0x4240AF: xzalloc (util.c:53) 
==7463== by 0x410F78: jrnl_begin (journal.c:186) ==7463== by 0x40DD71: set_cluster_store (store.c:1209) ==7463== by 0x405F13: finish_join (group.c:552) ==7463== by 0x4061C2: update_cluster_info (group.c:598) ==7463== by 0x406F70: __sd_join_done (group.c:845) ==7463== by 0x40753B: event_done (group.c:959) ==7463== by 0x4105D2: bs_thread_request_done (work.c:178) ==7463== by 0x421DA3: event_loop (event.c:181) ==7463== by 0x40425F: main (sheep.c:272) Before these two fixes, I get a lot of lost data. With these two fixes, I get 0 bytes definitely lost. Signed-off-by: Shevek Signed-off-by: Liu Yuan commit 7e94977ef5951deadaa32e4bcca4a33e3727d37b Author: HaiTing Yao Date: Thu May 10 16:49:31 2012 +0800 sheep: update inode cache first When create snapshot, write inode of base VDI internally without care of cache. Then the inode in cache may be wrong from disk Signed-off-by: HaiTing Yao Signed-off-by: MORITA Kazutaka commit f6891ef586a4ab691fb1011650bcb7264fd04549 Author: MORITA Kazutaka Date: Fri May 4 21:44:07 2012 +0900 sheep: add support for gateway mode The gateway nodes only forwards requests to the sheepdog cluster, and doesn't store objects in local at all. Running a gateway daemon on localhost and connecting to it would be an alternative to implementing sheepdog fail-over support in qemu because the gateway node automatically choose the correct target nodes. Currently, this options is just an alias for '--vnodes 0'. Note that a small storage to store epoch information is still required for gateway nodes. Signed-off-by: MORITA Kazutaka commit 59ea93a28062f4ebdf3ebf945531775a29c7028d Author: Liu Yuan Date: Tue May 8 20:11:11 2012 +0800 zookeeper: switch gcc atomic builtins to atomic helpers Acked-by: Yunkai Zhang Signed-off-by: Liu Yuan commit 453b5461ba3841bc731225844abc710827b4da03 Author: Liu Yuan Date: Tue May 8 20:10:03 2012 +0800 configure: add urcu into configure It is suppoed to be used by sheepfs. 
Though sheep doesn't necessarily need this library, its well implemented atomic operation header will be put good use for other sheep code. To use atomic header, just add this in the source: #include And its API used as uatomic_xxx(). We can get the lib dev file by $ sudo apt-get install liburcu-dev or compile it from the git source: git://git.lttng.org/userspace-rcu.git Signed-off-by: Liu Yuan commit 760eba24da37664553d6fb694fc434f68905db90 Author: Liu Yuan Date: Mon May 7 14:09:25 2012 +0800 sheep: init hdr.epoch before check_request() Signed-off-by: Liu Yuan commit 265e6feace832fda9c0c6d95b81acb0f78acd0d7 Author: MORITA Kazutaka Date: Fri May 4 03:38:14 2012 +0900 sheep: avoid calling get_vnode_info() against force operations The force operations can be processed before we set up current_vnode_info. For example, when we start the cluster, sheep daemons can receive SD_OP_READ_VDIS requests from other nodes before setting current_vnode_info. So the force operations shouldn't access req->vnodes in their process_work() and process_main(). Signed-off-by: MORITA Kazutaka commit cdeabfce2966470791e68337f533a8f13eaf8dc5 Author: Yunkai Zhang Date: Sun May 6 05:36:55 2012 +0800 Fix script/Makefile.am after removed old test file Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit e9f052e5edb5380414ff5bcef5d0a96dbadda1ff Author: MORITA Kazutaka Date: Fri May 4 05:14:14 2012 +0900 remove old test file Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit c81e3f7e176c46282f4f349cde00e565d6b8b2c3 Author: Liu Yuan Date: Thu May 3 18:54:32 2012 +0800 script: add checkpatch.pl Since we advocate Linux kernel coding style, it would be convenient to use its test script too. usage: script/checkpatch.pl path-to-your-patch NOTE: no need to pass --no-tree, I have hacked it on as default option. 
Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit a0b0d9273d843789272bf7adb856aaa9c1b5ae79 Author: Liu Yuan Date: Thu May 3 12:01:18 2012 +0800 sheep: cleanup get_latest_epoch() return uint32_t instead of int Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 310b974beb818d57822d1e862b94d90f0d7cf457 Author: levin li Date: Thu May 3 18:25:46 2012 +0800 fix a bug of copies calculation in delete_one() It tries to compare the nr_copies with inode->nr_copies, but the inode has just been allocated, so its nr_copies may be zero or something random Signed-off-by: levin li Signed-off-by: MORITA Kazutaka commit b50e9ae8fb8728b1ac387b7431f13f5df0a7dd84 Author: levin li Date: Thu May 3 18:25:44 2012 +0800 remove useless inode in del_vdi() The inode defined and allocated in del_vdi() is no longer used, we should remove the useless code. Signed-off-by: levin li Signed-off-by: MORITA Kazutaka commit 541d0fc3d5fde29be1a1cc3dbca8674bc50f2c40 Author: Liu Yuan Date: Wed May 2 11:22:05 2012 +0800 object cache: fix create_cache_object() We should unlock fd before err-out. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 893d38bec4ed10fb7dbd81457560991ed0c66c02 Author: Liu Yuan Date: Wed May 2 11:59:33 2012 +0800 sheep: cleanup epoch type We have defined 'epoch' as uint32_t, but much of the code uses it as int, so fix this inconsistency. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 946de3b29821ba33ec678e79d708ea9d28f501cd Author: Liu Yuan Date: Wed May 2 11:22:03 2012 +0800 sheep: clean up is_access_to_busy_objects() We don't need to check oid == 0 or opcode & flags, because this is always guaranteed by the caller.
Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 1c288cdbbcb0bad933aeff5938ff6a9bab711638 Author: levin li Date: Wed May 2 17:16:56 2012 +0800 remove unused vnodes_info in start_deletion() I noticed there's a vnodes_info defined and set to NULL, but never used except at last by put_vnodes_info(), so I think we should remove it. Signed-off-by: levin li Signed-off-by: MORITA Kazutaka commit 0a602bc1cd5de6f3b627897847294a6d158bc773 Author: levin li Date: Tue May 1 21:26:22 2012 +0800 fix a bug in get_store_dir() which gives a bad directory for simple store For simple store, it gives a directory for the object such as /home/levin/disk/store/2/obj/00000001007c2b2500000002 but in fact it should be /home/levin/disk/store/2/obj/00000001/007c2b2500000002 This bug may cause sheep fail to remove data objects Signed-off-by: levin li Signed-off-by: Liu Yuan commit 435a14381c14aacd73718b27f916ab2c1eac37b9 Author: Liu Yuan Date: Wed May 2 10:52:00 2012 +0800 sheep: bypass check_request() for cached object if we go for a cached object, we don't care if it is busy or being recovered. Signed-off-by: Liu Yuan commit 099b97122991fafe6f395837a6f114a07745a12f Author: Liu Yuan Date: Wed May 2 10:51:45 2012 +0800 sheep: fix io_op in queue_request() We should call setup_access_to_local_objects() before check_request(), otherwise check_request() will be always a NULL operation. - check_request() only works for io_op, so move it into if clause. Signed-off-by: Liu Yuan commit 064d101e4b454905ab4dac6aceac961c2ad25223 Author: Liu Yuan Date: Wed May 2 10:51:37 2012 +0800 sheep: clean up client_rx_handler() info It is really annoying to see below redundant debug info for only one queue_request(). Let's have a 1:1 mapping for this. 
May 01 17:11:12 client_rx_handler(406) connection from: ::1:35638 May 01 17:11:12 client_rx_handler(406) connection from: ::1:35638 May 01 17:11:12 client_rx_handler(406) connection from: ::1:35638 May 01 17:11:12 client_rx_handler(406) connection from: ::1:35638 May 01 17:11:12 client_rx_handler(406) connection from: ::1:35638 May 01 17:11:12 client_rx_handler(406) connection from: ::1:35638 May 01 17:11:12 client_rx_handler(406) connection from: ::1:35638 May 01 17:11:12 client_rx_handler(406) connection from: ::1:35638 May 01 17:11:12 client_rx_handler(406) connection from: ::1:35638 May 01 17:11:12 client_rx_handler(406) connection from: ::1:35638 May 01 17:11:12 client_rx_handler(406) connection from: ::1:35638 May 01 17:11:12 client_rx_handler(406) connection from: ::1:35638 May 01 17:11:12 queue_request(263) 3 Signed-off-by: Liu Yuan commit 55820c5d5b44245ea1ec91691bdc6c51b558036e Author: Liu Yuan Date: Tue May 1 16:13:38 2012 +0800 sheep: get rid of gcc warning "ops.c:294: warning: declaration of ‘ctime’ shadows a global declaration /usr/include/time.h:258: warning: shadowed declaration is here" Signed-off-by: Liu Yuan commit 888de5ddd2ac2c107cbac1454f676917f5eb8d68 Author: Liu Yuan Date: Tue May 1 16:13:37 2012 +0800 object list cache: move it out of store.c Signed-off-by: Liu Yuan commit f29b105a2015afad0da8561183978263e56368e3 Author: Liu Yuan Date: Tue May 1 02:34:41 2012 +0800 sheep: move store_* operations into ops.c - move stat_sheep() into ops.c too Signed-off-by: Liu Yuan commit 610035b2b4e8163a083917e626c608f0d0baa247 Author: Liu Yuan Date: Tue May 1 02:34:18 2012 +0800 sheep: move read/write/remove_object() into store.c - thus we can make read/write_object_local() as local static functions. Signed-off-by: Liu Yuan commit 59fe234ea2e2059ad79ece4a62f24c10baf30e32 Author: Liu Yuan Date: Sat Apr 28 21:35:49 2012 +0800 object_cache: fix a race condition create_cache_object() need to get a lock to avoid race with object_cache_rw(). 
This is to address below bug in the log: Apr 28 17:48:12 do_io_request(931) 2, 47c33f00003f80 , 1 Apr 28 17:48:12 object_cache_rw(321) 00003f80, len 4096, off 1396736 Apr 28 17:48:12 client_rx_handler(427) HEAD, fd:280 Apr 28 17:48:12 client_rx_handler(432) DATA_INIT, fd:280 Apr 28 17:48:12 client_rx_handler(470) END, fd:280 Apr 28 17:48:12 queue_request(275) 2 Apr 28 17:48:12 do_io_request(931) 2, b1f0b000003f9b , 1 Apr 28 17:48:12 object_cache_pull(423) [local] 00003f9b Apr 28 17:48:12 object_cache_pull(423) [local] 00003f9b Apr 28 17:48:12 create_cache_object(352) 00003f9b already created Apr 28 17:48:12 object_cache_rw(321) 00003f9b, len 16384, off 2940928 Apr 28 17:48:12 read_cache_object(307) size =0; count= 16384 Apr 28 17:48:12 do_io_request(956) failed: 2, b1f0b000003f9b , 1, 3 Apr 28 17:48:12 io_op_done(151) leaving sheepdog cluster Signed-off-by: Liu Yuan commit 61e8b0d9f6c07d6c1af0a90d17db7dee51839fa5 Author: Liu Yuan Date: Tue May 1 02:33:58 2012 +0800 farm: refactor trunk_file_write() Since we already have a dedicated funciton trunk_file_write_recovery() for system recovery handling, we don't have 'user' flag for trunk_file_write() and simply rename it as trunk_file_write_user() Signed-off-by: Liu Yuan commit 9741800758c6b2f3f9ba27dadef1802a0e538353 Author: Liu Yuan Date: Tue May 1 02:32:58 2012 +0800 farm: clean up farm_end_recover() - print what the epoch we reach instead of 'iocb->epoch - 1' - use epoch instead of 'iocb->epoch - 1' Signed-off-by: Liu Yuan commit 24ace35127b3aed065404e9044b0d6b66a60ea48 Author: Liu Yuan Date: Tue May 1 02:30:58 2012 +0800 sheep: refactor is_access_local() - let is_access_local to calculate copies itself - fix a nasty bug in void io_op_done(), we should not pass 0 or leave_cluster() will never be called. 
Reviewed-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 579109780c8ce9a35cf495968d1dfdabc8d9ac63 Author: Christoph Hellwig Date: Sun Apr 29 09:32:30 2012 -0400 logger: fix pid confusion We need to store the logger (child) pid in a variable as getppid() fails inside the death handler. Instead of going back to using the pid variable like it was done before "logger: improve death messages" use two different variable for the sheep and logger, and comment on why we perform this confusing dance, so that it doesn't get accidentally removed again. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 91f78aae93dd010dda2edd6975cd1dd3e9557f54 Author: Yunkai Zhang Date: Sat Apr 28 16:07:20 2012 +0800 Make connection information more readable Add ipstr and port in connection structure, and fill these values in create_client(). We can dprintf them when we need to show us where the client connection comes from. To help us debuging, I dprintf these info in client_rx_handler() and destroy_client(). Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit 07e095a515ff81ab082aa151050aaf14536202c6 Author: Liu Yuan Date: Tue May 1 02:31:03 2012 +0800 sheep: fix get_nr_copies() We should never return anything greater than sys->nr_copies. - use min() to get the better readability - change sys->nr_copies as 'int' to stay in line with nr_zones Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 8567aae281c75502c0a267bf76b771a2af8392f2 Author: Yunkai Zhang Date: Tue May 1 02:51:24 2012 +0900 Remove zk_lock from zk_join Use ip address and port number to sort member list instead of sequence number from /sheepdog/queue. 
Benefit from these changes, the order of member list no longer depends on the order of joining, then need not to take following steps into one transaction: - get sequence number from /sheepdog/queue - create znode in /sheepdog/member/ - send join message to cluster As a result, we can remove lock from zk_join, and move the creation of znode in /sheepdog/member/ into zk_dispatch. I use binary tree to store member list so that it can make the sort more quickly. Methods with node_btree_xxx prefix are used to operate this binary tree. ==*Note*==: When sheep startups, it fetchs member list by reading /sheepdog/member/*, if the result is empty, sheep will consider itself as *master*. Now we have removed lock from zk_join,if we start multiple sheeps simultaneously, one problem arises: there may exist more than one *master*, this is bad. To prevent this problem, we can start multiple sheeps like this: - start the fist sheep alone, and sleep 2 seconds: $ sheep -d /store/0 -z 0 -p 7000 -c zookeeper:localhost:2181 $ sleep 2 - start other sheeps simultaneously(need not to sleep between them): $ for i in {1..100}; do sheep -d /store/$i -z $i -p $((7000 + $i)) \ -c zookeeper:localhost:2181 Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit b5e8a8e196f724c670c17d1ff5381b225f1c22b7 Author: Yunkai Zhang Date: Thu Apr 26 23:21:29 2012 +0800 Fix bug: leave event lost in zookeeper driver If one sheep joined and left too quickly, it may lead to leave event lost. Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit 0b256be4fc2c0c7a2821f04808e5740f53ed2e04 Author: Yunkai Zhang Date: Thu Apr 26 23:21:28 2012 +0800 Fix bug: zk_leave doesn't work Now we just need to delete znode in /sheepdog/member, and zookeeper will notify LEAVE event to all other sheeps automatically. 
Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit 1b0f1e64b881edca9c9af5b518d40e087715bf77 Author: Yunkai Zhang Date: Thu Apr 26 23:21:27 2012 +0800 Add code to handle sequence number overflow The range of sequence number in zookeeper is: [-2^31 ~ (2^31-1)], We store it in queue_pos variable, and use queue_pos++ to refer to the next znode in /sheepdog/queue. When sheep creates the first znode in /sheepdog/queue, zookeeper initializes the sequence number with 0. Each time we create a new znode, its value increases by 1. So it will overflow and become negative in the future. In fact, its value wraps around like a circle. This patch tries to handle this problem. Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit 79d71648b3dbf1d7a268633a5696e926f0bfd3d2 Author: Yunkai Zhang Date: Thu Apr 26 23:21:26 2012 +0800 If previous zookeeper session exists, shutdown sheep Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit 34adf5b202f4e0028cb5787de5f2fc48b20a7e06 Author: Yunkai Zhang Date: Thu Apr 26 23:21:25 2012 +0800 Rewatch znode in /sheepdog/member after it changed If we do not rewatch znode in /sheepdog/member after it changed, we will lose all events of it in the future, such as DELETE event. Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit b77142ea7f093daa3d3ba8640a84801dfe19ddd6 Author: Yunkai Zhang Date: Thu Apr 26 23:21:24 2012 +0800 Use atomic builtins to replace pthread mutex locks There were two pthread mutex locks in zookeeper driver: 1. pthread mutex lock in zk_dispatch/zk_lock, this lock was used to prevent zk_lock popping data before zk_dispatch has pushed it. 2. pthread mutex lock in zk_queue_pop/add_event, this lock was used to protect leave event list. This patch uses GNU atomic builtins(__sync_xxx) to replace these two pthread mutex locks. It makes zookeeper driver lighter and faster.
Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit 60d59b1288afd8f0ec76bd596053c2e9c3000301 Author: Yunkai Zhang Date: Thu Apr 26 23:21:23 2012 +0800 Fix two bug: 1. Change pthread lock in zk_dispatch/zk_block so that it will not block IO request between sheep. 2. If one node pointed to by queue_pos was send by leaver, and it have blocked whole cluster, we should ignore it. Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit a4c6a05a37c58b68cc3d7d5fbf11216264e17aad Author: Yunkai Zhang Date: Thu Apr 26 23:21:22 2012 +0800 retry again when zoo_* api return ZCONNECTIONLOSS/ZOPERATIONTIMEOUT error Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit d36044d5d22705cc85916a6b5331cfe5f98c0bbd Author: Yunkai Zhang Date: Thu Apr 26 23:21:21 2012 +0800 Optimize the size of buffer send to zookeeper We send the whole buf[MAX_EVENT_BUF_SIZE] to zookeeper regardless the real content is so small. Now, we only send the real content in the buffer, not waste anymore. Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit cc60ac6259b3c834d343491586427158b555bd55 Author: Yunkai Zhang Date: Thu Apr 26 23:21:20 2012 +0800 Refactor zookeeper driver This patch tries to refactor zookeeper driver. It have changed so much code that it's difficult to explain how it work in detail. There were several problems in the old zookeeper driver: 1. Each zookeeper message contains whole member list, it make network become heavy and slow. 2. Before sending each message, get_nodes function was call. We used this function to get the number of members by comparing each znode in /sheepdog/queue. This function will become more and more slower as znodes increasing in /sheepdog/queue. 3. The processing method of LEAVE EVENT is too lengthy. When a sheep droped from the cluster, zookeeper will remove the corresponding znodes in /sheepdog/member automatically, and each sheep will be notified by the registered watcher fcuntion. 
Normally, sheep should process this leave event when it got it, but it didn't, it created a new leave event message and sent it to /sheepdog/queue, until all sheep received this new message again. 4. There are so many unnecessary zookeeper locks. These locks greatly reduce zookeeper driver's performance. This patch tries to fix all problems mentioned above: 1. Reduce message size, remove member list from message content and save it in two places: a) In /sheepdog/member. When a sheep joins into cluster, it puts its information into /sheepdog/member/, and gets other sheep's information from there. b) In each sheep's private memory. Each sheep will maintain an array to contain the newest member list so that it can access this content as quickly as possible, and needn't call get_nodes function when it sends a new message. 2. Optimize the processing method of LEAVE EVENT. Now, sheep will process leave event once it received it. 3. Remove unnecessary zookeeper locks. Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit bb45efe5999950ff70dc2c422c7069a472e4ff10 Author: Christoph Hellwig Date: Fri Apr 27 11:35:15 2012 -0400 factor a finish_join helper out of update_cluster_info Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 38ec7ed38295f15405f7ed56ab34a11f08ba7a3f Author: Christoph Hellwig Date: Fri Apr 27 11:34:06 2012 -0400 sheep: split recovery from store.c Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 376598e08edcfacfb8f341b97e4caee6f491f39b Author: Christoph Hellwig Date: Fri Apr 27 10:23:54 2012 -0400 sheep: fix get_zones_nr_from for clusters with gateways Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit d0b2e42da3babf28d76be6d805e5292d0b84311b Author: Christoph Hellwig Date: Fri Apr 27 10:23:53 2012 -0400 sheep: rewrite get_max_copies using get_zones_nr_from Also document it and rename it to get_max_nr_copies_from to avoid confusion with get_nr_copies.
Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit ba7e579470c0a991a02d69ac6bdf4787079b199e Author: Christoph Hellwig Date: Fri Apr 27 10:23:52 2012 -0400 sheep: rename nr_sobjs to nr_copies nr_sobjs is an extremly confusing name for the number of copies in the cluster, so let's rename it. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit d9eb6e66e4dc1db62e8a4ef7d7c96b40749504e0 Author: Christoph Hellwig Date: Fri Apr 27 10:23:51 2012 -0400 sheep: cleanup nr_copies handling Add a new get_nr_copies helper to consolidate calculating the number of copies of an object that we need to deal with. As a side effect this allows making struct vnode_info private to group.c again. There are a few places that take a different number of copies from the inode or the on the wire header. These look incorrect to me, but I'd like to have some review of those. These places should be either documented or removed ASAP. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit db1795d440ff31ea9f1fbbfc36ce7219f27b97c3 Author: Christoph Hellwig Date: Fri Apr 27 10:23:50 2012 -0400 sheep: add a vnode_is_local helper Add a helper to check if a given vnode is local to shorten the repeated calls to is_myself. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit a47736502cf3593eefc57c1be620cdc04811bb66 Author: Christoph Hellwig Date: Fri Apr 27 10:23:49 2012 -0400 sheep: add a oid_to_vnode helper Abstract away mapping from a oid to a vnode into a new helper instead of calling obj_to_sheep directly. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 42204e66e0961c67420214a40e73536977510f47 Author: Christoph Hellwig Date: Fri Apr 27 10:23:48 2012 -0400 sheep: rewrite the vnode mapping layer Rewrite vnode mapping by making the structure containing the tuple a first class citizen and passing it around to all places that deal with vnodes. 
This in turn also allows getting rid of the list of these structures, as we can simply create one at each epoch change and reference count it. This also uncovered a nasty bug where queue_request called setup_access_to_local_objects before the vnode-related fields in the request were set up. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 0e1f483533baeaf7b08edfd424bbe715c5f7d632 Author: Christoph Hellwig Date: Fri Apr 27 10:23:47 2012 -0400 sheep: remove setup_ordered_sd_vnode_list There is little value to a function that has two callers, where one of them only needs half of it. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit b40955a34c91a109ddf9ac92519d168b46e1f179 Author: Christoph Hellwig Date: Fri Apr 27 10:23:46 2012 -0400 sheep: cleanup io_op_done Use a switch statement to make checking of the various error returns more readable, use a goto label for the common retry code, and split the data object consistency checking into a function of its own. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit ce14cd45b3f5989766e546a03312dc97b0fa740b Author: Christoph Hellwig Date: Fri Apr 27 10:23:45 2012 -0400 sheep: cleanup is_access_local All callers have a struct request that is used for various arguments, so pass it directly. Also mark the function static given that is has no callers outside of sdnet.c. Signed-off-by: Christoph Hellwig Signed-off-by: Liu Yuan commit 18c221f487297284d938ba66f80dc21d0fa4e254 Author: Yunkai Zhang Date: Fri Apr 27 21:15:39 2012 +0800 sheep: bug in event_done leads to dead lock Dead lock was found in the following scenario: Suppose that there are two sheeps: S1, S2, and their event_queues are empty. Now S1 received a notify message: M1, and call sd_notify_handler() which will add notify event to its event_queue and than call process_request_event_queues() to queue_work this event. At the same time, S2 send a notify message: M2 to cluster and an I/O request(eg. 
do_lookup_vdi operation) was submitted to S1 when S2 calls zk_dispatch() to handle M2. After S1 received I/O request from S2, it would finally call process_request_event_queues() to deal with this event, if S1 call this function before M1's event_done() finished, this I/O request would not to be processed for the event_queue was not empty. This problem leads to dead lock between S1 and S2, S2 would be blocked in read() waitting for the data responsed by S1, and the whole cluster would be suspended forever. To fix this problem, we just modify the code in event_done, so that it can process request_queue after event_queue is empty. Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit 0275cefb8acdf5696028e5d994819e417142a4a9 Author: Shevek Date: Fri Apr 27 22:27:30 2012 +0800 Rebracket unregister_event calls in group.c Fixes the pairing of register_event and unregister_event on the cluster driver file descriptor. It now (coincidentally) matches the assignments to event_running. There were several code paths where either a duplicate unregister could happen, or no work would be queued, so the file descriptor would never be re-registered. Signed-off-by: Shevek Signed-off-by: Liu Yuan commit 3f7ed715e3f054a7546b215b4640d497a5fcbbc7 Author: Shevek Date: Fri Apr 27 22:25:10 2012 +0800 Easier to grep for PANIC calls Add PANIC to all panic formats, to make logs easier to grep. All fmt parameters should be string constants, so this concatenation is always valid. Signed-off-by: Shevek Signed-off-by: Liu Yuan commit 229ab9af84831923fb990b7611e2f40071b662ce Author: Shevek Date: Fri Apr 27 22:23:27 2012 +0800 Check before allocating Avoid useless allocation/free in a loop by delaying allocation until we know we need the memory. 
Signed-off-by: Shevek Signed-off-by: Liu Yuan commit 446ed7552e6883c7a6c839dff82e6ed590bd5369 Author: levin li Date: Fri Apr 27 12:29:06 2012 +0800 make remove_object() returns -1 when fail to delete one of the objects Currently, remove_object() only returns -1 when the last deletion fail, it's not correct, we need to returns -1 when fail to delete any one of the objects Signed-off-by: levin li Signed-off-by: Liu Yuan commit 9477c05ba7a856354280299b368c9b090649257c Author: HaiTing Yao Date: Fri Apr 27 10:27:18 2012 +0800 sheep: if fail to read copy from replication, retry it 1, Retry all of the replication, so will not breaking from the loop 2, Assign the actual returning value, because the EIO maybe lead to node leaving the cluster. Signed-off-by: HaiTing Yao Signed-off-by: Liu Yuan commit 500ec9f07eaebcd069dd3e4ec75cdab4ccf56e21 Author: levin li Date: Thu Apr 26 10:45:31 2012 +0800 change 'write' to 'wr' to avoid declaration shadowing The argument 'write' of get_vdi_attr() shadows a global declaraton in unistd.h which will be involved in my next patch. Signed-off-by: levin li Signed-off-by: Liu Yuan commit ad8c5cf64953a27b143f1aa660207e320ba17454 Author: levin li Date: Mon Apr 23 14:18:06 2012 +0800 always delete data objects when deleting an cloned vdi When deleting a cloned vdi, sheep find the root vdi and then traverse the vdi chain(such as base --> snapshot --> clone) to check wheter there's an undeleted vdi in the chain, if some vdi in the chain isn't deleted, sheep just mark the cloned vdi as deleted by clear its vdi name. But in fact a cloned vdi may created its own objects by copy-on-write, these objects can be deleted when deleting the vdi, so we make the cloned vdi to be deleted as the root vdi, then we can deleting its data objects, in delete_one() we check whether the object belongs to itself to determine whether to delete the object. 
Signed-off-by: levin li Signed-off-by: Liu Yuan commit dabb5200202f2225e69002c055afba97ee5cf73a Author: levin li Date: Mon Apr 23 14:09:38 2012 +0800 deleting data objects of a vdi before deleting the inode Currently, when deleting a vdi, sheep firstly clears the name in the vdi inode to mark it as deleted, then try to delete the data objects, there's a problem, if deleting one or more data objects fails, we could never delete the object any more, because the inode has been deleted. Now I exchange the order of deleting inode and data objects, if deleting some data object successes, we clear the the correlative flag in inode->data_vdi_id[], orelse we set the dw->delete_error to tell that an error occurs in the deletion work. In delete_one_done(), if dw->delete_error is true, we set inode->vdi_size to 0 to show this vdi has been deleted, but fail to delete some of its objects, then we can try to delete the vdi again. Signed-off-by: levin li Signed-off-by: Liu Yuan commit 453072667ce08c6613d627b8ed0ac4f33e6e8dbe Author: Christoph Hellwig Date: Wed Apr 25 12:55:12 2012 -0400 sheep: fix error handling during store intialization Catch corrupted strings in the config file for the store name, and error out if the store name isn't valid instead of continuing. Also factor the store initialization into a separate function to clean the code up a bit. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 4ae3eb8987da8682517280102ccc1ba58cd8fc5a Author: Christoph Hellwig Date: Wed Apr 25 12:55:11 2012 -0400 sheep: mark the store name as plain character pointer The missing cases also show that we weren't propagating the const attribute down to the journal layer, so fix that as well. 
Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit e862b4458410483511c98aaeec668add4eb78961 Author: Christoph Hellwig Date: Wed Apr 25 12:55:10 2012 -0400 sheep: move extern declarations for global variables to sheep_priv.h Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit ff49040495c8556859079d476aa35613190017eb Author: Christoph Hellwig Date: Wed Apr 25 12:55:09 2012 -0400 collie: handle parse_vdi failures Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit ec5efc1559b8c84e4e3406372e1a29d951562d41 Author: Christoph Hellwig Date: Wed Apr 25 12:55:08 2012 -0400 sheep: check for init_work_queue failures Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit b99e7b559052774a7246e76e5ca73b09bc649f7f Author: Christoph Hellwig Date: Wed Apr 25 12:55:07 2012 -0400 sheep: check for register_event errors We don't have a better way to panic / exit, but at least this allows propagation upwards for better handling later, and in the case of event_fn cleanly exits the node from the cluster by dieing instead of causing potentially more harm when going on. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit e16161b9d19eb91fb09fc010e4311be10fc82387 Author: Christoph Hellwig Date: Wed Apr 25 12:55:06 2012 -0400 sheep: remove write only variables in read_object_local Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit a11e385107a185e1f031f9f91f881b4dd8688fad Author: Christoph Hellwig Date: Wed Apr 25 12:55:05 2012 -0400 sheep: fix error handling when reading the journal end mark Handle errors or short reads from pread when looking for the journal end mark. To simplify the code flow also remove jrnl_has_end_mark and IS_END_MARK_SET in favour of inlining them into jrnl_recover. 
Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 3d5e71f9e54e8488eca6933c924082c02040f402 Author: Yunkai Zhang Date: Wed Apr 25 17:23:58 2012 +0800 sheep: fix epoll_wait infinite loop Add code to handle EPOLLERR and EPOLLHUP events in client_handler, otherwise epoll_wait would run into infinite loop when these events occur. Signed-off-by: Yunkai Zhang Signed-off-by: MORITA Kazutaka commit cea5e890dc2e5fefa14f1f7a0cde59fae2e0e134 Author: hch@infradead.org Date: Wed Apr 25 03:03:02 2012 -0400 sheep: remove cdrv_handlers and check_join_cb Instead of obscuring the callbacks into sheepdog from the cluster drivers by various means of callbacks just call them directly like you would do in normal C code. The block_cb callback to the notify routine is left for now as that area needs broader attention later. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 2cd61a4f444ea462f8f7e8c71179a5d704adcd08 Author: Christoph Hellwig Date: Wed Apr 25 02:59:29 2012 -0400 sheep: do not automatically log to syslog if run in foreground Add a new --stdout option to explicitly chose logging to stdout instead of automatically selecting it when running in foreground. Many daemons like upstart, supervisord or systemd prefer programs to not daemonize, but don't nessecarily handle logging to stdout very well. To support these better allow running in foreground while still logging to syslog. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 5302e1db741351ba5ddf85b9b8d2120a34ca8d40 Author: Christoph Hellwig Date: Wed Apr 25 02:59:14 2012 -0400 logger: cleanup dolog Remove the write only ts variable, move ops into the minimal required scope and break lines after 80 characters. 
Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit cc5e7ee93a67acc535af3659e9664f9ef381cbf6 Author: Christoph Hellwig Date: Wed Apr 25 02:58:53 2012 -0400 logger: improve death messages Make the fact more clear that a SIGHUP to the logger means that the sheep daemon died a horrible death. Also improve the log message for a logger death a little, and rename log_sigexit so the name describes its function better. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 7e0033c801a866a43cd68598e9605e8651720fb6 Author: levin li Date: Wed Apr 25 11:29:33 2012 +0800 collie: add an operation to show nodes in recovery It's useful to show nodes in recovery when we are debugging or maintaining the cluster, many times we want to know which nodes are in recovery, before this, we check the logs to find out whether the cluster has ended recovery, but with this patch, it's easier. usage: #collie node recovery output: Nodes In Recovery: Id Host:Port V-Nodes Zone 4 127.0.0.1:7006 64 6 5 127.0.0.1:7007 64 7 8 127.0.0.1:7010 64 10 9 127.0.0.1:7011 64 11 10 127.0.0.1:7012 64 12 Signed-off-by: levin li Signed-off-by: Liu Yuan commit 9f9e7e91fcb001f4a96fa37b2918e0c7aa15bf03 Author: levin li Date: Tue Apr 24 15:59:41 2012 +0800 fix a bug in rx() when read() returns 0 but connection is not marked as closed. if read() returns 0, it means connection closed, but previous read() may set errno to EAGAIN, in which case rx() would just return 0 without setting the connection state as C_IO_CLOSED Signed-off-by: levin li Signed-off-by: Liu Yuan commit 225f6d84713b82603e6a4e34aeb4fccceb07e5df Author: Liu Yuan Date: Fri Apr 20 12:26:26 2012 +0800 store: use hex to printf error code Signed-off-by: Liu Yuan commit 4f2ec717df8555fcabebba1bb80160d0bab4f7d3 Author: Liu Yuan Date: Wed Apr 18 10:43:19 2012 +0800 sheep: retry read other copy in failure We have strong consistency for writing objects, so we can read any copy for data.
In recovery stage, read cluser might fail and we should retry it. - ask fix_object_consistency() to CREATE instead of WRITE in the case of object not found. Signed-off-by: Liu Yuan commit 610187a65fc345d7e27485cc5d0dc9de16cc088b Author: levin li Date: Tue Apr 17 14:51:22 2012 +0800 remove object oids from object list cache when deleting a vdi When deleting a vdi, sheep removes data objects of that vdi, in which case we need to remove the object ids from the object list cache for the good of next data recovery. Signed-off-by: levin li Signed-off-by: Liu Yuan commit 6e67178e55ae010e3a0d346de07c9872838aaa55 Author: levin li Date: Tue Apr 17 14:09:25 2012 +0800 show vdi tag and clone mark in the output of 'collie vdi list' It's useful to show the tag of every vdi, and the clone mark makes it easy to know whether the vdi is cloned from other. Signed-off-by: levin li Signed-off-by: Liu Yuan commit 1a0ee6bd2ce9ee0e7956019773aaed27167f960e Author: Liu Yuan Date: Mon Apr 16 22:37:59 2012 +0800 farm: refine the read_working_object() output When the object is not found in working direcotry, it not an error case, we should retrofit the debug message to be more friendly read_working_object(362) failed to open /home/tailai.ly/shee \ pdog/store/0/obj/00fd34af00000015: No such file or directory | V read_working_object(362) object fd34af00000015 not found Signed-off-by: Liu Yuan commit fdf4768749778bc944034bf247227ee2f0dfe072 Author: Liu Yuan Date: Mon Apr 16 22:37:48 2012 +0800 object cache: fix race in create_cache_object() Object cache layer allows concurrent RWs on the same oid to achieve the best performance. We should check if object is already created or we will get an EIO from object_cache_rw() in the race condition. 
Signed-off-by: Liu Yuan commit be4096103f6883924c09bceb868f143c29ab5512 Author: Liu Yuan Date: Mon Apr 16 22:02:08 2012 +0800 object cache: fix cow write We should bypass object cache for COW object writing, because we simply shouldn't create COW object without fetching it first from the cluster storage Signed-off-by: Liu Yuan commit f9a32ddafec9863b4b3addc948bc0f6f8ca78008 Author: Liu Yuan Date: Fri Apr 13 21:54:14 2012 +0800 sheep: fix read_copy_from_cluster() We should pass nr_vnodes to the obj_to_sheep() - rename read_copy_from_replica to get better consistent naming Signed-off-by: Liu Yuan commit 1b578e87d5e674838cdbbe8f49ea270dd6778703 Author: Yunkai Zhang Date: Mon Apr 16 11:41:56 2012 +0800 object cache: incorrect lock may lead to update lost Currently, we use a dirty_rb tree to record which cache object have been updated. Two kinds of threads will operate this dirty tree concurrently: a. mulitple io-workers: write something to cache objects b. one flush-worker: flush all updates to cluster In the following scenario, update will be lost: flush-worker one io-worker [object_cache_push] [object_cache_rw] |-(1) get a cache object from dirty tree | |-(2) read the data file of this object | | modify data file of this object (3)-| |-(4) forward_write_obj_req() | | add this object to dirty tree (5)-| |-(6) rb_erase: remove this object from dirty tree | Note: io-worker generate *new update* in step (3), but flush-worker remove this cache object from dirty tree in step (6). I use two dirty trees to fix this bug and avoid heavy lock between flush-worker and io-wroker threads. There is only one *active* dirty tree for io-workers in any time. After io-worker modify something, it operate this active dirty tree. When flush-worker want to flush all updates to cluster, it: 1. mark another tree as *active* dirty tree. 2. get update info from *inactive* dirty tree. 3. if something wrong occur in flushing process, merge two dirty trees into active drity tree. 
Signed-off-by: Yunkai Zhang Signed-off-by: Liu Yuan commit 8513a5970a2be2c9e07f976e4fcb9f621e056b18 Author: levin li Date: Fri Apr 13 15:37:36 2012 +0800 collie: add an operation to track an object's locations at each epoch When I was debuging the recovery issue(commit: 470533b), I found it conveninet to have such a tool to track an object's trace at each epoch to check whether an object is placed at the right place, so I add this operation for collie at such case. usage: $collie/collie vdi track debian -i 3 output: obj b1f0b000000003 locations at epoch 1, copies = 3 --------------------------------------------------- 127.0.0.1:7004 127.0.0.1:7000 127.0.0.1:7001 obj b1f0b000000003 locations at epoch 2, copies = 3 --------------------------------------------------- 127.0.0.1:7007 127.0.0.1:7004 127.0.0.1:7000 obj b1f0b000000003 locations at epoch 3, copies = 3 --------------------------------------------------- 127.0.0.1:7007 127.0.0.1:7008 127.0.0.1:7004 Signed-off-by: levin li Signed-off-by: Liu Yuan commit 8fdff424ef2b5f4c4e2d892443bcdadc2e35a022 Author: levin li Date: Fri Apr 13 11:45:01 2012 +0800 collie: show nodes at all epoches for collie cluster info When we check the cluster status using 'collie cluster info', it can only output the nodes at the latest eight epoch at most, It's better to output as many as possible, as it's good for us to check the cluster status. Signed-off-by: levin li Signed-off-by: Liu Yuan commit 62e5b738b2cf4d999a452fb083b03073befb051f Author: levin li Date: Fri Apr 13 14:06:08 2012 +0800 collie: make some optimization for parse_objs() Currently, parse_objs traverse all the nodes to get the object, and then call the callback function, it's OK for do_print_obj(), but it's a waste for get_data_oid(), when the cluster gets larger with more nodes, it makes collie slower the get the object id. 
I add a return value for the callback function obj_parse_func_t, if the function returns 1, the loop breaks when the first time it succeeds, if 0, it will traverse the whole node list. Signed-off-by: levin li Signed-off-by: Liu Yuan commit 73b51091f267d0f2062cb39e98dbb93b47a3d6ff Author: Liu Yuan Date: Mon Apr 16 14:00:50 2012 +0800 trace: add GNU liscense to the source files It seems that every .c file has this disclaimer inside, so follow the convention. Signed-off-by: Liu Yuan commit 73bb5efe3e48288002162732b61b98bf5cdf9b01 Author: Liu Yuan Date: Mon Apr 16 14:00:49 2012 +0800 sheep: make gateway and io workers configurable Most of the time one VM would issue multiple requests in one go, so it would be useful to let users to decide how many workers are useful if we have more than several VMs in single sheep node. In a large set nodes of cluster, every single node will get multiple concurrent recovery IO requests, so it would be useful to have io_workers as configurable too. default 4 workers for both and maximum UINT32_MAX. Signed-off-by: Liu Yuan commit f4651b0ace8dc55ce0d02b128437f233d9329e1d Author: Liu Yuan Date: Thu Apr 12 22:43:09 2012 +0800 object cache: make sync flush as default Signed-off-by: Liu Yuan commit 8407d7027efba639bb79141b5bc08d93cd679c53 Author: Liu Yuan Date: Wed Apr 11 21:54:02 2012 +0800 sheep: remove cpg* leftover Now sheep support more than just corosync cluster driver, so we'd better remove cpg* naming from sheep core code. Signed-off-by: Liu Yuan commit 138595da92a2c27958823ee88c2aafb8d9419098 Author: Liu Yuan Date: Wed Apr 11 21:20:33 2012 +0800 sheep: use io workers to run local request There is only one cpg worker, so it should be dedicated to event queue. 
So we have the following worker strategy: 1 cpg worker for event queue - node & notify event 4 IO workers for request queue - local requests & sheep p2p IO requests 4 gateway workers for request queue - VM guest requests - factor out consistency check code Signed-off-by: Liu Yuan commit 84187361d9e21a6a76d340908f632f36887db0cf Author: Liu Yuan Date: Wed Apr 11 21:20:33 2012 +0800 sheep: remove unnecessary guard check - setup_access_to_local_objects() does the job to find the if the targeted object is local, so we don't need to check it again. - SD_FLAG_CMD_RECOVERY and SD_OP_READ_OBJ should be paired by programmers, since it is just internal API, we should trust 'em. Signed-off-by: Liu Yuan commit b4d17fc7a44745f22e95639bbcf05876a73874b2 Author: Liu Yuan Date: Wed Apr 11 21:20:27 2012 +0800 sheep: refactor process_request_queue() - move away request checking logic from request handling process. This will improve the readibility a bit. - add a new field 'local_cow_oid' in struct request to avoid redundant calls of is_access_local() Signed-off-by: Liu Yuan commit cfbb34e29a7754d6b8d640904b7d0bd3f887b977 Author: MORITA Kazutaka Date: Wed Apr 11 01:56:16 2012 +0900 remove coroutine code It was overkill to use coroutine for group_handler race conditions. Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 432115cf52b947c97dfde2635fe6244e40dd3438 Author: Liu Yuan Date: Mon Apr 9 13:52:33 2012 +0800 sheep: split cpg_queue into two We need to split cpg_queue because: 1) IO requests shouldn't block node change events because succeeding event will replace the previous one to mitigate the overhead of unnecessary IOs from recovery. 2) IO requests may not block some notify events such as SHUTDOWN because some nodes are shutdown-ed maybe mistaken as nodes to be left by the blocked nodes. 
This patch splits the queue with the following characteristics - local requests and IO requests are placed on request queue - confchg and notify event are placed on event queue Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 460154a528153e25fc51bec512f1a950ed217047 Author: Liu Yuan Date: Mon Apr 9 13:52:32 2012 +0800 sheep: process gateway request only in fast path For request with flag SD_FLAG_CMD_IO_LOCAL, we handle it in normal path. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 63eb48136de48c853c7a1ce3bf2827fc775e1f58 Author: Andy Chen Date: Mon Apr 9 17:57:18 2012 +0800 logger: rotate log by size, default is 500MB a simple patch to split log: when log size exceed 500MB, a new sheep.log file will be created, and old log will be renamed with specified suffix(actually, it's the time when split happened). the main purpose of this patch is to prevent log became too big, which is hard to open and read. Signed-off-by: Andy Chen Signed-off-by: Yibin Shen Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit d8b8c981809e5b92d2490ede4f9d78deb91e5ccb Author: Yibin Shen Date: Thu Apr 12 17:57:55 2012 +0800 store: set correct path for farm in store_remove_obj Signed-off-by: Yibin Shen Signed-off-by: Liu Yuan commit 470533b1f5d6515690f903024801d791eada9336 Author: levin li Date: Wed Apr 11 14:50:07 2012 +0800 fix a bug in recovery which makes sheep get an incomplete epoch node list In do_recover_object(), when recovery fail at some epoch, we need to go back to the previous epoch, but get_vnodes_from_epoch() gives an incomplete node list for the specified epoch, so the target node from which we will recover the object is wrong sometime, in that case, recovery always fails, the length we used to read epoch file was shorter than expected, now it's fixed. 
Signed-off-by: levin li Signed-off-by: Liu Yuan commit a7b69a049c7c61864758060d61b26616efaa7e4e Author: Liu Yuan Date: Tue Apr 10 15:02:07 2012 +0800 trace: support cat operation from collie usage: after tracing the sheep, you can get a function graph: collie debug trace cat - use per thread buffer, so no lock overhead - every buffer has the size of 8M - after trace buffer is catted, the buffer is reset, that is, you can not cat it again to get the output. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 90abff863e91c4e45dd472813d9a3f77d7c44905 Author: Liu Yuan Date: Tue Apr 10 15:02:06 2012 +0800 trace: add a new interface for trace utility usage: collie/collie debug trace start #start tracing sheep collie/collie debug trace stop #stop tracing sheep Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 8847acb4f9f332fae270bd9f1b9260a71ddb59ad Author: Liu Yuan Date: Tue Apr 10 15:02:05 2012 +0800 sheep: fix group_handler face Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 46d63af1e322b8a14296b632822750df1614deca Author: Liu Yuan Date: Tue Apr 10 15:02:04 2012 +0800 sheep: revert coroutine for fixing group_handler() race Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 0c2079129d5d315e6c031ab98d5fde5da8a184d9 Author: Liu Yuan Date: Tue Apr 10 15:02:03 2012 +0800 trace: add a ring buffer Tracers use this ring buffer to store its output internally. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 50400e6223c02ce27de9b43f6be6fe0d34a0ae9e Author: Liu Yuan Date: Tue Apr 10 15:02:02 2012 +0800 trace: add graph tracer - add return hooker Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 75df6bd0b6f8881230a9efcf7355106c336c4de6 Author: Liu Yuan Date: Tue Apr 10 15:02:01 2012 +0800 trace: low-level trace infrastructure proper We achieves this by dynamically patching the binary code at runtime, making use of GCC -pg instrumentation features, which instrument a 5-bytes opcode in every function. 
When the trace is turned off, the opcode is patched as NOP opcode. When turned on, the opcode is a call opcode to our trace functions and we can hook callbacks both for function entry and exit. This is how the 'graph' tracer actually calculates the time spent by function. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 4b02952edae433abd744b318e61360d09fa94c01 Author: Liu Yuan Date: Tue Apr 10 15:02:00 2012 +0800 sheep: export some structures for trace use Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 3934648ef4c9c8b76e84d1ef05ab67ce7cc67c28 Author: Liu Yuan Date: Tue Apr 10 15:01:59 2012 +0800 logger: teach logger funcs to notrace Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit e1aefa6ba97b92beb8835051e4ad2c6229b8d76f Author: Liu Yuan Date: Tue Apr 10 15:01:58 2012 +0800 trace: add low level ip2function support We use GCC's stabs to map IP to its symbol(Function Name). Normally, loader doesn't load symbol into memory, we have to manually code ld script to load it. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit b3484215704f9fcf75fdb693d31445cecf0cb09d Author: Liu Yuan Date: Tue Apr 10 15:01:57 2012 +0800 trace: low level functions in assembly Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 85d0d1be9dd8950079415a2dc2dbd046ed2b67a3 Author: Liu Yuan Date: Tue Apr 10 15:01:56 2012 +0800 trace: driver trace to work Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit fa703f127b838b6c684ea0cb5a31581cb9ad73c3 Author: MORITA Kazutaka Date: Sat Apr 7 02:23:48 2012 +0900 use valloc for reading cow data object Sheep may read the cow data object from local, so we need to use valloc() here.
Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 8a84614001f5c5ef83ea86f2ed76e5f4f536185f Author: MORITA Kazutaka Date: Sat Apr 7 02:23:47 2012 +0900 set the maximum number of replication correctly Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit c32c15cee52e1a4e76dc29e6abb0375d72f257e9 Author: levin li Date: Fri Apr 6 18:14:01 2012 +0800 fixed a bug of reading length which may cause core dump. in farm_link(), when we try to put the object from the trunk to the object directory, we specified a const length SD_DATA_OBJ_SIZE, but the object we try to put may has a different length, which may cause core dump in some case. Signed-off-by: levin li Signed-off-by: Liu Yuan commit 6df31cbc2e45216da7fb9e96f886486ee1bd1c4e Author: levin li Date: Thu Apr 5 14:43:35 2012 +0800 fixed a fd leak in read_working_object() Signed-off-by: levin li Signed-off-by: Liu Yuan commit 3c3706241c7c37fbe5cdee39bd6521c760763fd1 Author: levin li Date: Fri Apr 6 17:57:39 2012 +0800 Implement the trunk map in farm with rb-tree instead of file It's too inefficiency since we can only write into an omap file sequentially and everytime the trunk map is updated, we need to update the cache file, so I implement the trunk map with rb-tree, and store it in the memory, not in the cache file, after shutdown of a sheep node,we just need to cleanup all the trunk files when startup again. Signed-off-by: levin li Signed-off-by: Liu Yuan commit 2668c28065d2838d85b2d37d60d3b8872734bb13 Author: levin li Date: Fri Apr 6 18:02:05 2012 +0800 cleanup the data object in system snapshot by user When recovering is over, it's no need storing the system snapshot data any more, since it's just for recovering. So when recovering is complete, we can cleanup the system snapshot to save the disk space, I implement it by adding a new subcommand 'cluster cleanup' to collie. 
Signed-off-by: levin li Signed-off-by: Liu Yuan commit c390371074193bf1d99ffb634b02e5571b7e6c72 Author: levin li Date: Fri Apr 6 17:40:32 2012 +0800 add a field nr_zones to cluster_info In some path such as the object cache path, we need to determine the copies count with the zones count, but we can't get the sd_vnodes list to calculate the zones count, so it's better to include a nr_zones field in cluster_info, everytime the cluster complete recovering, we recalculate the nr_zones field. Signed-off-by: levin li Signed-off-by: Liu Yuan commit 24ad8544ddf13a36a07fdbcc8ea17723b7e2de3e Author: Liu Yuan Date: Wed Apr 4 17:11:45 2012 +0800 object cache: add flush_and_delete operation If 1) VDI is opened without cache enabled and 2) we unfortunately have a cache for it previously, we should flush the cache then delete it. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 881054ac7d55b58a30ffe71ba0df81343dc52115 Author: Liu Yuan Date: Tue Apr 3 13:25:19 2012 +0800 store: use O_DIRECT IO for backend store Backend store don't need page cache, by this way we make room of memory for object cache to maximize its performance. Signed-off-by: Liu Yuan commit 6da313e826ab3013149a83849c572943b76f6f61 Author: Liu Yuan Date: Tue Apr 3 01:00:26 2012 +0800 object cache: fix wrong usage oc->oid object cache 'oc' is globally shared by all IO threads, so we cannot use oc->oid without lock. Actually, we don't need it at all. Signed-off-by: Liu Yuan commit 8407936377baa3b5e8201f2b560bc429d1dcba7b Author: Liu Yuan Date: Mon Apr 2 16:19:57 2012 +0800 object cache: introduce async flush We async flush dirty object as default to achieve the best performance. If users prefer strong consistency over performance, users can launch sheep with -S or --sync option. We need async flush because: 1) some APP are responsive time sensitive, the writeback of dirty bits in the guest will mostly hurt RT because guest need to await its completion.
This is a considerably long operation in the sheep cluster. 2) some APP are just memory and CPU intensive, has little of concern of disk data. (For e.g, just use disk to store logs of APP) 3) People simply prefer performance over consistency. Signed-off-by: Liu Yuan commit 9ca50770fe298e9da5d56dd81483fc396ba14d74 Author: Liu Yuan Date: Mon Apr 2 16:19:16 2012 +0800 object cache: enable direct IO for cache object When sheep is launched with '-D' or '--directio' option, we will use direct IO for object cache too. - object cache default to use both host page cache and disk write-back cache (if any) this means best performance and greedy to use host memory as much as possible. Signed-off-by: Liu Yuan commit 09107378c6dabc8df4388e17c7eeba0038ce5496 Author: levin li Date: Wed Mar 21 09:44:51 2012 +0800 farm: avoid unnecessary IO operation when recovering When the cluster is recovering, we only need to write the objects which no longer belong to the node to the snapshot, instead of writing all the objects, by which we decreased the IO operation. When we try to read an object, we first read it from the local object directory, if not found, then read it from the snapshot. Signed-off-by: levin li Signed-off-by: Liu Yuan commit ca8ae13335ec0c9dc3b645cf24e685a3d7b122a5 Author: Liu Yuan Date: Fri Mar 30 11:14:30 2012 +0800 object cache: cache create operation Currently create operation, we write through the cache. This operation would be slow and return-err especially in node changes phase. It is clean to let gateway only talks to object cache, and rely object cache layer on pulling and pushing objects from/to sheep cluster. With all the gateway requests operated on object cache to boost the IO performance, we can do interested tricks with Farm, such as data de-duplication, which trade cpu cycles for disk space while not hurting the IO performance. 
Signed-off-by: Liu Yuan commit 519550f9a74b5627319b40bc591feed816e6718b Author: Liu Yuan Date: Fri Mar 30 11:14:20 2012 +0800 object cache: retry pulling from all the copies in failure This retry strategy allow the object cache layer more robust Signed-off-by: Liu Yuan commit 39bc7c851284862d12e7ba95f4c85b24f0a277be Author: levin li Date: Fri Mar 16 10:07:46 2012 +0800 sheep: object list cache applied to farm and simple store Applied the object list cache to get_obj_list() which walk through the rb-tree cache and return the cached list, instead of calling sd_store->get_objlist(), make it much faster to get the object list. Signed-off-by: levin li Signed-off-by: Liu Yuan commit 41762dcfc38043aa18b72fd06d3a88fb6d6cb4a1 Author: levin li Date: Tue Mar 27 10:35:45 2012 +0800 sheep: add object list cache implemented by rb-tree Added object list cache implemented by red-black tree, everytime we creates a new object, we write the oid of the object into the object list cache. Signed-off-by: levin li Signed-off-by: Liu Yuan commit bd2502cd31e3888add8115c00f7dc6918f882f54 Author: Liu Yuan Date: Tue Mar 27 15:09:18 2012 +0800 object cache: implement vdi delete operation Signed-off-by: Liu Yuan commit 12cdeb580527d3dba962440df26b5dfa0215c42a Author: Liu Yuan Date: Tue Mar 27 15:09:18 2012 +0800 collie: read the cached objects if any for collie operation Collie operation such 'vdi list' or 'vdi object' need to read the lateset VDI object, it might be cached in the object cache. We can't pass SD_FLAG_CMD_CACHE to sheep because collie doesn't know if the cache mode is enbled. So we rely on the assumption that the cached object is freshest. Signed-off-by: Liu Yuan commit 16c9caa1984583f26e7b52fb50acf36ec4b406f5 Author: Liu Yuan Date: Tue Mar 27 15:09:18 2012 +0800 sheep: let object cache better interact with recovery logic If the targeted object of the gateway request is cached already, we can simply operate on it despite of node changes. 
Signed-off-by: Liu Yuan commit 4d8d2d6358e75cea4b84b29d06a9d0df9923fd4b Author: Liu Yuan Date: Tue Mar 27 15:09:18 2012 +0800 sheep: add flush_vdi operation This is supposed to be initiated by Guest OS, but our collie friend might also like it. Flush operation is operated on vdi basis, that is, when one guest flush its own dirty data, other guests are not affected. - use forward_write_obj_req() to flush dirty objects Signed-off-by: Liu Yuan commit 1597ce31a1269ab413c9f81189b571a6bfe43327 Author: Liu Yuan Date: Tue Mar 27 15:09:17 2012 +0800 sheep: teach sheep to use object cache We only intrude IO code for gateway requests. Object IO path from recovery logic is intact. Signed-off-by: Liu Yuan commit f95fbca4d0c4def63612d987278f7a8ced58fb0d Author: Liu Yuan Date: Tue Mar 27 15:09:16 2012 +0800 sheep: object cache proper Object cache caches data and vdi objects on the local node. It is at higher level than backend store. This extra cache layer translate gateway requests into local requests, largely reducing the network traffic and highly improve the IO performance. Dirty objects will be flushed to cluster storage by 'sync' request from guest OS. - use red-black tree to track dirty objects - use file lock to avoid RW race on object granularity - use hash lists to maintain vdi space. - each vid has its own independent object cache Signed-off-by: Liu Yuan commit 0d2d73aa9d40f9cff4a0c3ce48dcbf36c5f3a125 Author: Liu Yuan Date: Fri Mar 23 18:48:43 2012 +0800 sheep: add red black tree library Red black tree is more scalable than hash lists if we cannot predict the size of the data set. This is excerpted from Linux kernel. 
Signed-off-by: Liu Yuan commit 87e665db29c8b2d1219020a49005ec773db85463 Author: Liu Yuan Date: Fri Mar 23 18:47:43 2012 +0800 sheep: drive object cache to work Signed-off-by: Liu Yuan commit db47b5dd2bedc0d3cc3a44fa85d9dca8ed3ad81e Author: Liu Yuan Date: Thu Mar 29 17:36:21 2012 +0800 Revert "sheep: use pointer to avoid extra copy" This reverts commit ebe6d5289978fc9316d2e4d8330a5b1c6d3ca5bb. Reason from the commit 32ae69d898721887253868ebe3d78b6f927ea6f9 We cannot overwrite the request header in forward_*_obj_req() because its fields are referred in __done(). commit 0adb1f8f9558b79cdae905d8546831aac9722cc3 Author: HaiTing Yao Date: Fri Mar 23 16:15:31 2012 +0800 sheep: reduce snapshot COW read/write Doing snapshot COW: 1, If new writing request occurs and need COW for snapshot, now read old object to buffer, then write the buffer to new object, then write the request data to new object. We can merge the latter two writing request. 2, If new writing request covers whole object, no need to read old object. After the modification, pass bigger buffer to do_write_obj when doing COW, but it will not add the burden. COW is never for inode object, so it will not use the journal. Signed-off-by: HaiTing Yao Signed-off-by: Liu Yuan commit 46a50bc5763fccfee684bac860705f8755e5f771 Author: Liu Yuan Date: Fri Mar 16 10:55:44 2012 +0800 farm: fix put_entry() of trunk There is race when trunk entry is put for hash list. Signed-off-by: Liu Yuan commit 7ee8b1b2ee059454b45d50d3097fd9c54ea55ac4 Author: huxinwei Date: Fri Mar 16 02:01:54 2012 +0000 sheep: fix recovery logic It can be cases that, in some epoch, sheepdog cannot maintain the required copies of replications. When recovering from such epoch, we'd better be conservative and double check.
Signed-off-by: Xinwei Hu Signed-off-by: Liu Yuan commit 941b37ed4576d85d73bae9d6d929a2bcb8d449c8 Author: HaiTing Yao Date: Thu Mar 15 13:22:33 2012 +0800 sheep: modify 'buf' member of deletion work 1, make it more readable 2, malloc size is not right, not multiply size of uint32_t Signed-off-by: HaiTing Yao Signed-off-by: Liu Yuan commit 603e960d2df270d96d942b29288b0e7c43ef44a8 Author: Liu Yuan Date: Wed Mar 14 10:25:11 2012 +0800 sheep: fix broken data length passed to store Object size for vdi_attr obj is never right, though it didn't cause problem, just bigger than it really needs. - use SD_INODE_SIZE for inode size - fix object size passed to store_create_and_write() Signed-off-by: Liu Yuan commit ebe6d5289978fc9316d2e4d8330a5b1c6d3ca5bb Author: Liu Yuan Date: Sun Mar 11 00:01:40 2012 +0800 sheep: use pointer to avoid extra copy forward_write_obj_req() is kind of hot code path, so this kind of extra copy is nasty. Signed-off-by: Liu Yuan commit f64d72db7e0e9adc60614741c90811d044a79620 Author: Liu Yuan Date: Sat Mar 10 23:58:46 2012 +0800 farm: fix preallocation object size Signed-off-by: Liu Yuan commit dd29f76870ab0678a577c46cf5e3f2ddee5e7ae8 Author: Liu Yuan Date: Sat Mar 10 23:50:36 2012 +0800 simple store: fix preallocation object size We should allocate different sizes for vdi and data object, instead of fixed size for both. Signed-off-by: Liu Yuan commit 8fdf0481dc0b17367fca695bc11b01e5cbee0737 Author: Liu Yuan Date: Sat Mar 10 23:42:55 2012 +0800 sheep: remove unnecessary code This code is never visited and acted as a safe guard, but the uppper layer code should take responsibility of initializing hdr->copies to avoid this redundant check in the hot path. Signed-off-by: Liu Yuan commit 7269aad5645b41fc26367440aed662a3f9e060fb Author: huxinwei Date: Tue Mar 6 09:40:21 2012 +0000 Cleanup the log function The log_sigsegv function in logger.c actually serves two purposes. 1, the logger itself exit on illegal memory access. 
2, the sheep exit, which makes the kernel deliver a SIGSEGV to the logger process I think it'll be a good idea to make distinguish between these 2. Signed-off-by: Xinwei Hu Signed-off-by: Liu Yuan commit e6688ae317ab4a882b180a9c2e151e4d7b352d20 Author: Liu Yuan Date: Fri Mar 2 18:01:19 2012 +0800 farm: fix lookup_trunk_entry It should return NULL when 1. not found and 2. create = 0. commit 3ee56b7ca11d62bdd430671b793f4cac5c7f53c3 Author: huxinwei Date: Fri Mar 2 09:11:21 2012 +0000 farm: Fix to put_sha1_file I found an panic in sheep today. Mar 02 23:57:28 trunk_file_write(315) try delete stale snapshot object 8071d67a00000000... Mar 02 23:57:28 put_sha1_file(92) No such file or directory It happens when the sheep is killed after omap_file_init but before omap_file_final, and some of the staled snapshot have been unlinked. Then sheep will always panic every time afterward. The patch is a proposal to make put_sha1_file tolerant not-exist-file. Signed-off-by: huxinwei Signed-off-by: Liu Yuan commit 1dfdcb994fb7e730f92bb82adad1aeaec64382fe Author: huxinwei Date: Fri Mar 2 09:04:42 2012 +0000 sheep: Improve the verbose error message in log. Yet another trivial patch. I just copied over the comments for these results. It helps me to figure out the situation more easiler. Signed-off-by: huxinwei Signed-off-by: Liu Yuan commit 71678c3a9a6a355274f02d2a78a1f63ef44feb9d Author: HaiTing Yao Date: Thu Mar 1 13:41:44 2012 +0800 sheep: fix the memmove bug in del_cpg_node The size is not right, maybe lead to error Signed-off-by: HaiTing Yao Signed-off-by: Liu Yuan commit 86a25e9b0308ae67821e7fb6b79f51f2841fb745 Author: huxinwei Date: Wed Feb 29 08:18:10 2012 +0000 varies printf type typo I found several printf related type errors in 32-bit linux environment. Here's the trivial patch to fix them. 
- fix a warning (Liu Yuan) Signed-off-by: huxinwei Signed-off-by: Liu Yuan commit 55d9a39736c17dd3af91949aa79875921deb4c84 Author: MORITA Kazutaka Date: Wed Feb 29 01:31:11 2012 +0900 sheep: support specifying the initial count of virtual nodes With this patch, you can set the weight of each node with the -v option. Example: $ ./sheep/sheep /store/0 -p 7000 -z 0 -v 1000 $ ./sheep/sheep /store/1 -p 7001 -z 1 -v 2000 $ ./sheep/sheep /store/2 -p 7002 -z 2 -v 3000 $ ./collie/collie cluster format -c 1 using backend simple store $ ./collie/collie node list M Id Host:Port V-Nodes Zone - 0 10.68.14.1:7000 1000 0 - 1 10.68.14.1:7001 2000 1 - 2 10.68.14.1:7002 3000 2 $ ./collie/collie vdi create image 600M -P $ ./collie/collie node info Id Size Used Use% 0 188 GB 100 MB 0% 1 188 GB 200 MB 0% 2 188 GB 304 MB 0% Total 564 GB 604 MB 0% Total virtual image size 600 MB If you specify zero to the number of virtual nodes, no data will be stored to the node. Reviewed-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 5736d51484e7f878b0f12aff81c98df9c6a3a1e0 Author: Liu Yuan Date: Tue Feb 21 11:21:21 2012 +0800 farm: fix fd leak in fill_entry_new_sha1() Reviewed-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit ccaf90c246b2be6c98e8f2ff1e293572d53e7992 Author: MORITA Kazutaka Date: Tue Feb 28 02:53:46 2012 +0900 sheep/simple_store: fix a NULL pointer exception Signed-off-by: MORITA Kazutaka Signed-off-by: Liu Yuan commit 5eb6b7bc142e8c4567ea997fe9f9715ba6b0832d Author: Liu Yuan Date: Mon Feb 27 20:21:40 2012 +0800 sheep: remove useless code in do_local_io() We don't need to read old epoch in the case of SD_RES_NO_OBJ since do_recover_object() already does this. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 7cb9f67e25cb50e1395526b871c05089256c84f7 Author: Liu Yuan Date: Tue Jan 17 17:10:37 2012 +0800 store: abstract VDI bitmap set-up Sheep needs to set system VDI bitmap when starting up. 
And VDI object is stored in the underlying backend stores which have different layouts. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 0ba2cc967486c817b71539269226b3e799f57ca3 Author: Liu Yuan Date: Tue Jan 17 17:10:37 2012 +0800 farm: add format() support - add a format() hook to store driver interface. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 151f6e47f9b95eb67d623142f75028f209feddfb Author: Liu Yuan Date: Tue Jan 17 17:10:37 2012 +0800 collie: enable cluster-wide snapshot command Usage: $collie cluster snapshot # snapshot the whole cluster $collie cluster snapshot -l # list user snapshot info $collie cluster snapshot -R epoch # restore to state of targeted snapshot For e.g. below script #!/bin/bash pkill sheep rm store/* -rf for i in 0 1 2; do sheep/sheep -d /home/tailai.ly/sheepdog/store/$i -z $i -p 700$i;sleep 1;done collie/collie cluster format -b farm qemu-img create -f raw sheepdog:test 1G qemu-io -c "write -P 0x1 0 4M" sheepdog:test collie/collie cluster snapshot # Index 1 qemu-io -c "write -P 0x2 4M 4M" sheepdog:test collie/collie cluster snapshot # 2 qemu-io -c "write -P 0x3 8M 4M" sheepdog:test collie/collie cluster snapshot # 3 collie/collie cluster snapshot -l collie/collie cluster snapshot -R 2 ============================================================ OUTPUT: Formatting 'sheepdog:test', fmt=raw size=1073741824 wrote 4194304/4194304 bytes at offset 0 4 MiB, 1 ops; 0.0000 sec (8.142 MiB/sec and 2.0354 ops/sec) wrote 4194304/4194304 bytes at offset 4194304 4 MiB, 1 ops; 0.0000 sec (7.987 MiB/sec and 1.9968 ops/sec) wrote 4194304/4194304 bytes at offset 8388608 4 MiB, 1 ops; 0.0000 sec (9.381 MiB/sec and 2.3452 ops/sec) Index Snapshot Time 1 Fri Dec 23 22:21:05 2011 2 Fri Dec 23 22:21:08 2011 3 Fri Dec 23 22:21:11 2011 Cluster restore to the snapshot 2 ... 
Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit a67f4167ed4541e32017f3347302b70402105617 Author: Liu Yuan Date: Tue Jan 17 17:10:36 2012 +0800 sheep: add cluster snapshot/restore support This kind of snapshot is supposed be triggered by user, _not_ by recovery code. I don't think we need to restore to the state at the beginning of the recovery. So this work only permits us to restore cluster to the snapshot initiated by end users, thought it is quite easy to implement to restore to the snapshots forcibly taken by recovery path. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 0299cacfa362e18c75382e01cf8ff24f3c32a0f3 Author: Liu Yuan Date: Tue Jan 17 17:10:36 2012 +0800 farm: add a documentation for farm internals Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit a929b0078b1f66ac472b8d31f79a1e1408bf3d74 Author: Liu Yuan Date: Tue Jan 17 17:10:36 2012 +0800 farm: the farm impelmentation proper Farm is a new store for sheepdog with features [compared with Simple Store] - cluster wide snapshot. - node-wide data sharing for snapshot objects with the same content. so snapshoting is very cheap operation. - support 'collie cluster restore snapshot' to restore fully to the state of customized snapshots. - faster recovery. - auto checksumed snapshot objects. - no stale objects that are found in current implementation, that will waste storage a lot. - practically the same performance as Simple Store. A simple qemu-io test on my laptop shows that write read farm 14.840 MB/s 11.211 MB/s simple 14.842 MB/s 11.245 MB/s todo: - consolidate snapshot feature. - more enchancements. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 80cf6f68cff81de2988222136068849a5d766b98 Author: Liu Yuan Date: Tue Jan 17 17:10:36 2012 +0800 sheep: add end_recover() hook to store. We need to notify underlying store when the recovery ends. 
Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 9967ef7a4f8c29f5e58711b8d7fa53d378b0eaf0 Author: Liu Yuan Date: Tue Jan 17 17:10:36 2012 +0800 sheep: add begin_recover() hook to store. We need to notify underlying store when the recovery begins. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 920fa628e5d14330477ea61ee2fbcd5a6e685bae Author: Liu Yuan Date: Tue Jan 17 17:10:36 2012 +0800 farm: add snapshot object snap object is the meta data that describes the snapshot. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 25f0f967eccec4081ad84fe8b2a36852fdc111b0 Author: Liu Yuan Date: Tue Jan 17 17:10:36 2012 +0800 farm: remove stale object in backend store For storing one object into backend store, either a) no content change, then point to the same old sha1_file (no stale object) or b) content updated, then will point to a new object with a new sha1. So we need to remove stale object in case b), only in the assumption it is the object generated by recovery code. [*] When we try store new snapshot object into the backend store, it is safe and good timing for us to remove the old object with the same object ID. [*] Here I assume we don't need to restore to 'sys epoch' state. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 2901370171aacc0e64873bbc02441836de2f4cfa Author: Liu Yuan Date: Tue Jan 17 17:10:36 2012 +0800 farm: add trunk object trunk object is meta data that describes the structure of the data objects at the timeline of snapshot being taken. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 0ca5a5b3d46121b0f1a4c855d638828d1fd05a35 Author: Liu Yuan Date: Tue Jan 17 17:10:36 2012 +0800 farm: add sha1_file operations All the objects(snap, trunk, data) in the farm is based on the operations of the sha1_file. 
sha1_file provide us some useful features: - Regardless of object type, all objects are all in deflated with zlib, and have a header that not only specifies their tag, but also size information about the data in the object. It's worth noting that the SHA1 hash that is used to name the object is always the hash of this _compressed_ object, not the original data. - the general consistency of an object can always be tested independently of the contents or the type of the object: all objects can be validated by verifying that (a) their hashes match the content of the file and (b) the object successfully inflates to a stream of bytes that forms a sequence of + Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 906370bdddde44669a9bf19be9173daf883d9b33 Author: Liu Yuan Date: Tue Jan 17 17:10:36 2012 +0800 sheep: check object directory path at start-up Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit ce15e14878679ce15ce2c92ca031d152c5bd80d4 Author: Liu Yuan Date: Tue Jan 17 17:10:36 2012 +0800 sheep: modify the configures to run farm. Drive farm to work! add two new lib dependency: - libssl Dongsu Park suggests that we should check crypto lib for SHA1_Init with newer libssl. Currently libssl-1.0 and libssl-0.9.8 works with current configuration. For debian based systems, you can install them by the name libssl-dev. Signed-off-by: Dongsu Park Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 9aececd858d223cbbb119c3fd8cce79d0c13d8db Author: Liu Yuan Date: Tue Jan 17 17:10:36 2012 +0800 sheep: transfer store backend for newly joined node When the new node joins the cluster, it doesn't know what kind of backend store the cluster uses, so we need to transfer this information by master node in the join phase. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 732eac8ce83ffe8555734c650e4f02a86ffab613 Author: Liu Yuan Date: Tue Jan 17 17:10:26 2012 +0800 store: add dynamic mechanism to chain the available backend stores. 
- change global store structure to a pointer - use a list to maintain the stores. - use /obj/.store to remember backend store persistently. - now we can specify the backend store in the command collie cluster format -b farm #use farm if no store specified, currently sheep will use 'simple' store. if specified store not available, collie will return a list of available stores. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit a893faddf53cda4544bc9a731865bb27cfc3d0db Author: Liu Yuan Date: Tue Jan 17 17:09:26 2012 +0800 sheep: hide some minor store layout aware operations We need this to decouple the sheep from kv-store. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 56bfeb25730db3235c8615034ea2c59cc5f56f8b Author: Liu Yuan Date: Thu Feb 23 18:04:23 2012 +0800 sheep: fix recovery logic We should check if join_msg->inc_epoch before starting recovery after a new node joins. Signed-off-by: Liu Yuan commit a79d604bc4f0e1c19817ffcf5b9826a780a40eb6 Author: Liu Yuan Date: Thu Feb 9 16:40:33 2012 +0800 sheep: fix SD_FLAG_NOHALT collision SD_FLAG_NOHALT and SD_FLAG_CMD_WRITE use the same value. This is wrong. Signed-off-by: MORITA Kazutaka commit 60fb39e1ee2717ac81c37b30901c79a0d00b576d Author: Liu Yuan Date: Tue Jan 17 16:59:15 2012 +0800 sheep: stop serving non-force requests when in halt Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit e1bd76422052f9b25d597b200798f1a0a05a4114 Author: MORITA Kazutaka Date: Fri Jan 6 17:53:19 2012 +0900 sheep: fix cluster multicast Cluster ops with process_main() will be multicasted, so we need to set req->data to msg->data in such cases. Signed-off-by: MORITA Kazutaka commit 08022e944283196881159ed423a8577489b1e150 Author: Liu Yuan Date: Fri Dec 23 14:19:43 2011 +0800 sheep: refactor fill_obj_list() - rename __start_recovery to explicitly say it work in worker thread - rename recover_done. It is not really done for most cases when it is called.
Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 8ea1a6d4e44b4802866987d22a21ed5ba90d38b4 Author: Liu Yuan Date: Wed Dec 21 16:47:41 2011 +0800 sheep: abstract out 'all or nothing' write operation In recovery, sheep needs to atomically write the object to the store that is being recovered. This operation assume underlying store layout, so we need to abstract it out. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 49fca3f34b28f35979279695f8429daa5f1bbc77 Author: Liu Yuan Date: Wed Dec 21 12:01:48 2011 +0800 sheep: rename sheepdog_{,v}node_list_entry into sd_{,v}node This naming mostly force us to use multiple lines for function paras. We'd better have them terse. This patch is generated by utility 'find & sed'. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 981acd6dd3d31674833ecaf7457d0641f083fffa Author: Liu Yuan Date: Wed Dec 21 11:09:54 2011 +0800 sheep: refactor recovery logic Current recovery logic is elusive and it is not easy to be understood. I hope this work would ease the headache. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 8769bc2c47e7ac33c86af2b8432a02f0a86695b7 Author: Liu Yuan Date: Sun Jan 1 18:34:54 2012 +0800 makefile: add cscope support It is nice to get cscope.out by single line 'make cscope' Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 5a729d2762780e07dbbf2a3a033d45e66b10cc3e Author: Liu Yuan Date: Thu Dec 29 20:00:27 2011 +0800 sheep: use eprintf instead of fprintf(stderr) These fprintfs will be called in the context sheep, so we have to use eprintf for err messages. 
- coroutine: use panic() instead of fprintf() & abort() Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit bb41896a21ef662bd8ad1490b8eb986e3fa53ac6 Author: MORITA Kazutaka Date: Thu Dec 29 02:03:36 2011 +0900 sheepdog 0.3.0 Signed-off-by: MORITA Kazutaka commit d08d21dac3fb86a65dc50395314cdf17c7160428 Author: MORITA Kazutaka Date: Fri Dec 16 06:47:54 2011 +0900 sheep: set poll timeout Signed-off-by: MORITA Kazutaka commit ce8900befa8fcc0d8113b5d3f6546c4ec1ccba54 Author: MORITA Kazutaka Date: Fri Dec 16 06:44:08 2011 +0900 sheep: close cached fd when network error happens Signed-off-by: MORITA Kazutaka commit ba760df55799aa588a2fe71bbbde340032cfb88b Author: MORITA Kazutaka Date: Fri Dec 16 06:17:52 2011 +0900 sheep: write zeros when creating objects Signed-off-by: MORITA Kazutaka commit 8fd5959a85b4925c8553f9a5bb007b4559d15895 Author: MORITA Kazutaka Date: Fri Dec 16 06:54:18 2011 +0900 use __thread to simplify code This also fixes a problem that get_sheep_fd() is not thread-safe. Signed-off-by: MORITA Kazutaka commit bc042401d919502607714f462180de9941a8d035 Author: Liu Yuan Date: Wed Dec 21 00:22:03 2011 +0800 sheep: fix epoch_log_read_nr() We should propagate error out to the caller. - remove whitespace in passing Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 308aa40755c6196b94ebbd91c739d86318c7e49d Author: Liu Yuan Date: Fri Dec 16 17:48:52 2011 +0800 sheep: abstract out link operation for storage Currently if we recover the object from local old epoch store, we simply do a hardlink to the old object. Since this operation holds some assumption about underlying object store layout, we'd abstract it out. 
Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit ad68f07d2d8fb8c9698c6f4fcb2f3ed7150f776f Author: Chris Webb Date: Sat Dec 17 09:00:55 2011 +0000 Don't report an error for blocks not stored locally Signed-off-by: Chris Webb Signed-off-by: MORITA Kazutaka commit d71e4f845fda2ec63883039d34e216f34ccebc8b Author: Chris Webb Date: Tue Dec 13 17:28:34 2011 +0000 Correct the plural of 'one sheep' to be 'two sheep' Signed-off-by: Chris Webb Signed-off-by: MORITA Kazutaka commit 66c2b20e7be38e2206c48fa8d3d5812e93c2d271 Author: Chris Webb Date: Tue Dec 13 17:26:23 2011 +0000 Standardize error messages and tidy up remaining messages This patch includes a number of very trivial consistency and language fixes: - Standardize collie error messages printed to stderr to begin with a initial capital like other standard unix tools and the C library. (Messages written to the sheep.log are already standardized to be uncapitalised.) - Ensure all error messages are printed to stderr not stdout. - Fix some awkward wording, typos and grammatical issues in the messages. - Arrange for all of the tabular output from different collie commands to be similarly formatted and headings capitalised. - Spell VDI as 'VDI' not 'vdi' nor 'Vdi' in messages and comments. Signed-off-by: Chris Webb Signed-off-by: MORITA Kazutaka commit a84b57b4670183aa8bf272c974f3d827c1e1d706 Author: MORITA Kazutaka Date: Tue Dec 13 16:13:23 2011 +0900 sheep: remove undefined struct Signed-off-by: MORITA Kazutaka commit 0eea01011cb79419d49e60e5c44a9a7a998521c4 Author: MORITA Kazutaka Date: Tue Dec 13 14:28:11 2011 +0900 bash_completion: add support for new collie commands Signed-off-by: MORITA Kazutaka commit 8a1f0b760d991e0d0de9ff1f4adec70eb27c5683 Author: MORITA Kazutaka Date: Mon Dec 12 19:54:24 2011 +0900 collie: use gateway for read I/O requests Currently, collie sends read I/O requests without using gateway node but calculating the target node as the sheep daemon does internally. 
This is a hack to keep consistency even if the VM sends write requests to the same objects at the same time. But this enforces us to retry the collie command manually when it fails due to node membership changes. This patch uses a gateway node for collie's read I/Os, and makes them like QEMU's ones. Data consistency of QEMU is not a problem if we use SD_FLAG_CMD_WEAK_CONSISTENCY for collie's I/Os. Signed-off-by: MORITA Kazutaka commit 005efb91cc50616811505f5ea66e031e632c5b70 Author: MORITA Kazutaka Date: Mon Dec 12 19:54:02 2011 +0900 add support for reading objects without strong consistency To keep strong consistency, Sheepdog assumes that no two VMs can open the same VDI at the same time. This patch relaxes it and allows us to read data from opened VDIs by specifying SD_FLAG_CMD_WEAK_CONSISTENCY. Note that the obtained data may not the latest one. Signed-off-by: MORITA Kazutaka commit 89f7e1f95c466f07999dfc0fc4f9e9ccd0c60ca2 Author: MORITA Kazutaka Date: Mon Dec 12 17:31:25 2011 +0900 sheep: resume pending I/O requests after setting next recovery We need to set recovery_work before calling resume_pending_requests() because sheep checks the value to decide whether object recovery is running or not. Signed-off-by: MORITA Kazutaka commit 07945a7e4f046fe0301b681a8756e67a4e4431e8 Author: Liu Yuan Date: Tue Dec 13 15:43:28 2011 +0800 sheep: simplify get_vdi_bitmap_from debug info Current debug info for e.g, "get_vdi_bitmap_from(474) getting the vdi bitmap from 127.0.0.1" looks rather redundant. So let's cut it off. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 5b49c4ddc2bb082f00fa55853dec9ebfd40a2668 Author: Liu Yuan Date: Tue Dec 13 15:27:29 2011 +0800 sheep: abstract out get_obj_list() Current recovery logic needs to query underlying object store of object list in the specified epoch, in order to calculate object relocation. We need to abstract it out to adopt sheepdog to more stores. 
Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 74890c48f797acfbb8fc9e0ff4f9592b4977a607 Author: MORITA Kazutaka Date: Mon Dec 12 17:31:09 2011 +0900 sheep: set send/recv timeout send()/recv() could sleep long time if network failure happens during network I/Os, and it prevents from incrementing epoch number because we assumes that there is no outstanding I/O requests while updating node membership info. This patch fixes the problem. It is not a problem to set a small value for timeout because I/Os are retried automatically even if send/recv timeout has occurred. Signed-off-by: MORITA Kazutaka commit 7832ac6dee574826cd39effef943c0b322b0948f Author: Liu Yuan Date: Wed Dec 7 10:55:54 2011 +0800 sheep: fix hval_to_sheep() get_nth_node() doesn't return negative val, so we don't need to check it. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit e0a8b4d0a8debdc9c96a61a8792d284ac7bb3a85 Author: Liu Yuan Date: Mon Dec 5 14:39:18 2011 +0800 sheep: split merge_objlist() We actually use merge_objlist() for two different purposes, that is screening and mergeing. Let's split them out to make code more readable, since nothing these two share. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 7202d92ff240f5a9ea8a85f34eea7738069eb449 Author: Liu Yuan Date: Mon Dec 5 14:39:16 2011 +0800 sheep: clean up fill_obj_list() We already have vnodes layout in 'rw' and it is safe to use it instead of recalculating vnodes again. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 1b596ac9112763ab8424dd75cbcca38945cebbcd Author: MORITA Kazutaka Date: Fri Dec 2 12:50:03 2011 +0900 cluster/corosync: stay in the cluster even after cdrv->leave() is called Although the gateway node cannot work as a storage server, it must stay in the cluster to keep a consistent hash ring up to date. 
Signed-off-by: MORITA Kazutaka commit 377a512db9f51e12cd6fa2fc0e6b40ace65946dd Author: MORITA Kazutaka Date: Mon Nov 28 02:52:59 2011 +0900 sheep: use valloc for data object I/O Data objects may be opened with O_DIRECT, so we must allocate aligned memory here. Signed-off-by: MORITA Kazutaka commit 12a0c1be5b862fe7c1f543bbfe9b9c72bd4527d5 Author: MORITA Kazutaka Date: Thu Nov 24 12:59:16 2011 +0900 sheep: fix cluster event sequences Cluster drivers cannot call a 'check_join_cb' callback before sheep finishes the previous event handling. The simplest approach to solve this is: 1) call coroutine_yield() before exiting the event handler, and 2) re-enter the coroutine after the event is completely processed. Signed-off-by: MORITA Kazutaka commit 145303bd3f5151b330ea1d68060406130d0fbe61 Author: MORITA Kazutaka Date: Thu Nov 24 02:24:17 2011 +0900 introduce coroutine We have many works that need to be processed cooperatively (e.g. object recovery, data I/Os, cluster management). This library will enable us to implement them simply and elegantly. These files are copied from Accord project: https://github.com/collie/accord The following comments are based on qemu git log 00dccaf1: Asynchronous code is becoming very complex. At the same time synchronous code is growing because it is convenient to write. Sometimes duplicate code paths are even added, one synchronous and the other asynchronous. This patch introduces coroutines which allow code that looks synchronous but is asynchronous under the covers. A coroutine has its own stack and is therefore able to preserve state across blocking operations, which traditionally require callback functions and manual marshalling of parameters. 
Creating and starting a coroutine is easy: coroutine = coroutine_create(my_coroutine); coroutine_enter(coroutine, my_data); The coroutine then executes until it returns or yields: void my_coroutine(void *opaque) { MyData *my_data = opaque; /* do some work */ coroutine_yield(); /* do some more work */ } Yielding switches control back to the caller of coroutine_enter(). This is typically used to switch back to the main thread's event loop after issuing an asynchronous I/O request. The request callback will then invoke coroutine_enter() once more to switch back to the coroutine. Note that if coroutines are used only from the main thread, they will never execute concurrently. This makes programming with coroutines easier than with threads. Race conditions cannot occur since only one coroutine may be active at any time. Other coroutines can only run across yield. Signed-off-by: MORITA Kazutaka commit c4e3559758b2efdfbd0d145426a0e7b229438c42 Author: Liu Yuan Date: Wed Nov 30 20:00:00 2011 +0800 cluster, corosync: do mastership transfer when master is down in join phase If master is down before sending response in join phase, we have to revoke its mastership to avoid cluster hanging. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 47d6d863101c53d1c23d66608864754cf810af98 Author: Liu Yuan Date: Wed Nov 30 19:59:59 2011 +0800 cluster, corosync: enlarge is_master() audience We need this to do mastership transfer in join phase. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit d1b3364ec6ddc07c2606755d189b44f2ac667ef5 Author: Liu Yuan Date: Wed Nov 30 19:59:58 2011 +0800 cluster, corosync: add two dprintf It is helpful, really. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit c8b872d71cc6759049a8565217414793ed6c737b Author: Liu Yuan Date: Fri Nov 25 12:04:12 2011 +0800 sheep: fix vdi information lost bug When sheep is in halt status, we still need to progress to get vdi bitmap in __sd_join(). 
Reported-by: Jiang Wei Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit c29ddc03841d89c299002c00df7c561cc2962807 Author: MORITA Kazutaka Date: Mon Nov 21 15:30:03 2011 +0900 skip recovery when there is a pending recovery work Signed-off-by: MORITA Kazutaka commit 857ca6aba27dfb6487f2ce2445bfe35d528bece9 Author: MORITA Kazutaka Date: Mon Nov 21 15:27:33 2011 +0900 process only the latest epoch recovery This improves the performance of recovery when multiple node failure occurs. Signed-off-by: MORITA Kazutaka commit 1214e8a6d0561997ff8e205c81ba8ea9ab568418 Author: MORITA Kazutaka Date: Mon Nov 21 15:17:29 2011 +0900 reset retry_cnt before calling __fill_obj_list() Signed-off-by: MORITA Kazutaka commit c9c4625de6c2b942fa6a6843ee18038d48f05948 Author: Liu Yuan Date: Thu Nov 24 20:15:52 2011 +0800 collie: fix an typo in vdi object command output Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit a2fd195dd8b1f2dc3c883bc11f4e301187faf419 Author: Liu Yuan Date: Thu Nov 24 11:55:01 2011 +0800 sheep: don't exit when sheep calls leave_cluster() When some unrecoverable error happens, sheep daemon will leave the cluster but stay as a gate to redirect requests. For example, the following case is sheep meets an EIO ... Nov 24 10:36:15 do_io_request(785) failed: 2, 2, 7c2b2500000000 , 1, 3 Nov 24 10:36:15 io_op_done(147) leaving sheepdog cluster Nov 24 10:36:15 sd_leave_handler(1291) network partition bug: this sheep should have exited Nov 24 10:36:15 log_sigsegv(358) logger pid 8255 exiting abnormally ... This has nothing to do with network partition stuff. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit fa30aa545e41b72f2309e003c9335445c43b9223 Author: Liu Yuan Date: Thu Nov 24 14:45:07 2011 +0800 collie: fix vdi_object() read size This fixes the bug for command 'collie vdi object image -i x' Since now we don't support short read, for data object, we have to pass the exact size, or sheep daemon will error out.
Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 839b2e4c4c9bc35fc06296ac76333070d9addff9 Author: Liu Yuan Date: Tue Nov 22 15:31:03 2011 +0800 sheep: use do_process_work() to handle io request Since we already have a low level framework to handle requests, let's use it to handle io requests too. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 948d110669c0108f09d3d11a2cd78b6dc4aff532 Author: Liu Yuan Date: Tue Nov 22 15:31:02 2011 +0800 sheep: refactor local and io request handling They don't share any code or logic, let's split 'em out. - add a new function to handle local request. other minor changes: - rename store/cluster_queue_request into do_io/cluster_request to conform naming in ops.c Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit aa8234c6b29a4b37e4710e95aa65fca63ab0e17e Author: Liu Yuan Date: Tue Nov 22 15:31:01 2011 +0800 sheep: unify cow object and regular object writing path This is necessary to do further unifying of sheep requests handling. small changes on other: - remove read_from_one and merge it, make it return sd result. - rename read_from_other_sheep into read_copy_from_cluster Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit a9f926e7152a6f8ed87087819c9f3d4f7ae65f8b Author: MORITA Kazutaka Date: Fri Nov 18 19:09:08 2011 +0900 sheep: return error when read/write cannot process full-length data Sheepdog block driver doesn't expect that SD_OP_READ/WRITE_OBJECT processes less data than requested, so we should return SD_RES_EIO in that case. With this patch, we can return the result code in read_object() and make code readable. Signed-off-by: MORITA Kazutaka commit 1769d1ae058bfda9e0523bc70bf76cab44951586 Author: MORITA Kazutaka Date: Fri Nov 18 18:43:07 2011 +0900 reduce the maximum size of vdi attributes from 4 MB to 64 KB This allows us to make simple_store_read()/write() fail when it cannot read/write full length data. This patch can also remove SD_FLAG_CMD_TRUNCATE. 
Signed-off-by: MORITA Kazutaka commit 9cecc570956ef61981f3d722a7e69649ef600194 Author: Liu Yuan Date: Fri Nov 18 13:53:28 2011 +0800 sheep: abstract out store IO interface We need to abstract out store IO interface to adapt it to other IO store such as the coming 'Farm' store. The open/read/write/close is cumbersome for a common kv-store to work with, but this interface request smallest changes to current sheep store code. It sucks but works as a kludge. Don't get me wrong that I am writing an universal interface that will work well with different kinds of data stores, say, sql-store, non-sql store, unstructured store, the store that is not with local backing storage, etc. Simply I am *not* and I am always lost to foresee the future. This interface is stupid but simply enough that costs me smallest changes to existing code to let Sheepdog work with current store implementation and the coming 'Farm' store. I think those kind people who try to squeeze other useful stores into Sheepdog are at a better position to cook a more generic interface in the future. - Why include length, offset that many kv stores don't need at all? Okay, we're trying to implement huge data size, so we need these to do partial object read/write. - Why 'int fd' instead of a void *opaque for store object handle? I suppose file is everything in UNIX philosophy and so fd can name everything and I hate type conversion and frown when I can't cscope what it means for one second. And last, I am happy to see anybody prove me wrong and replace it with a more capable interface. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 298d86c1432c64a91ffd556e043f46ef5baff8c4 Author: Liu Yuan Date: Fri Nov 18 13:53:27 2011 +0800 journel: move data commiting out of jrnl_perform() Let jrnl_perform just concentrate on journaling stuff, not intrude in store IO. This would make store IO interface abstracting easier and cleaner.
Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 45eb24f01f8a61db83cb8c330d9baa1e77c20423 Author: Liu Yuan Date: Thu Nov 17 18:03:21 2011 +0800 sheep: use sys_stat_* helper to check status Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit b483b4c8942ea75787b1dab84fce78653a44b7ae Author: Liu Yuan Date: Thu Nov 17 17:58:41 2011 +0800 logger: quiet gcc about write() use xwrite() instead of write() to get rid of below kindly warning: logger.c:276: warning: ignoring return value of ‘write’, declared with attribute warn_unused_result Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit bfbb2f3315f27bfbfb82b518b993ca41f8dc2c98 Author: Liu Yuan Date: Thu Nov 17 17:50:03 2011 +0800 sheep: add string buf candy helpers This is almost taken from git. Thank git if you find it useful. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 59930511b64775acd0dd57be7891937f871775aa Author: Liu Yuan Date: Thu Nov 17 17:50:04 2011 +0800 sheep: add hlist candy helpers Taken from Linux kernel. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit aa43916f0621a0b036658aa1a97b3107f555b5ec Author: Liu Yuan Date: Thu Nov 17 17:50:02 2011 +0800 sheep: add some candy helpers in util.c These are trivial helper wrappers around standard IO functions and integer hash function. "stolen" from git and Linux kernel. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit d4a176a17d4a67540de74cdcb94b8c46ad0360ed Author: Liu Yuan Date: Thu Nov 17 17:50:01 2011 +0800 sheep: modify Makefile.am for candy helpers. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit dfec954511c3ea9352e11bd5662a6d7e1b51c759 Author: MORITA Kazutaka Date: Wed Nov 16 14:32:21 2011 +0900 sheep: fix uninitialized value in sd_join_handler() The value 'w' is unallocated when the join result is CJ_RES_MASTER_TRANSFER.
Signed-off-by: MORITA Kazutaka commit aaef69788a4b6d58f7e0860c6641fdf75d2469f4 Author: MORITA Kazutaka Date: Tue Nov 15 08:42:39 2011 +0900 cluster: add accord cluster driver This adds initial support for the Accord cluster driver. Usage: $ sheep /store -c accord:[accord server address] TODO: - use asynchronous Accord APIs - use watch notification instead of loop and sleep - use transaction instead of global distributed lock Signed-off-by: MORITA Kazutaka commit bf476af3cc33c7bc43a77cd6ab9f7c22431a35cf Author: Christoph Hellwig Date: Mon Nov 14 10:49:00 2011 -0500 sdnet: tidy up queue_request Use a switch for the system status, and use a common done goto labels for all cases that want to complete the request and return. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 6776e64acc65006756ca868bb2faf9f42877c0b2 Author: Christoph Hellwig Date: Mon Nov 14 10:48:41 2011 -0500 sdnet: split up __done Split the __done function into one helper per operation type given that there is no shared code between the different types. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit d7d98434f92b53fae7c424cfa48f22f312b9ea14 Author: Christoph Hellwig Date: Mon Nov 14 10:45:33 2011 -0500 fix a compiler warning in forward_write_obj_req rlen is never used in the function, and recent gcc complains about this fact. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 230ab38a7d405e04a55b419082fb30883baaa05a Author: CHEN Baozi Date: Mon Nov 14 15:45:54 2011 +0800 tests: add qemu-io testcases. Signed-off-by: CHEN Baozi Signed-off-by: MORITA Kazutaka commit 1340c11187d32682e99891108b48c83d7c101168 Author: CHEN Baozi Date: Fri Nov 11 23:17:22 2011 +0800 tests: add test_io method to support qemu-io test. Also fixed some python grammar bugs. (missing "self." when refering member variable in Python class) Noticed that the subprocess.PIPE in python has limited size. 
I redirect it to None after the node has joined Sheepdog successfully, or it would lead a dead-lock when the pipe becomes full. Signed-off-by: CHEN Baozi Signed-off-by: MORITA Kazutaka commit 2457989c839d35d57ecbdfc17fd3212dd127df40 Author: Christoph Hellwig Date: Fri Nov 11 07:49:48 2011 -0500 store: use fallocate when allocating new objects Writing zeroes into the last sector of an object is not going to preallocate it, but just allocates the last sector. This leads to fairly nasty fragmentation. Use fallocate on the whole object instead. On my test setup with XFS this speeds up writes to an unallocate volume from ~73MB/s to ~80MB/s. If the filesystem does not support fallocate we fall back to the old code. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit d7021eb3f8a4dd87afc504c0d7ebf7b03494e276 Author: Christoph Hellwig Date: Fri Nov 11 07:49:26 2011 -0500 store: split store_queue_request_local Split store_queue_request_local into one function for each command. While this leads to a small amount of duplication it keeps the code nicely separated and helps with adding new commands. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 95be7632c5154226771f6998ba56df8cb74874e0 Author: hch@infradead.org Date: Fri Nov 11 07:49:11 2011 -0500 enable silent make Don't display the compiler command line by default, and let errors stick out more clearly. If needed make V=1 shows the full command line again. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 10ec079a6ae5c28d410aa031b24c628fcd1bb169 Author: MORITA Kazutaka Date: Fri Nov 11 21:19:53 2011 +0900 Revert "store: propagate open failure in store_queue_request_local" This reverts commit 5d513a0f21cbdc143599441159347655ba72c455. 
Conflicts: sheep/store.c Signed-off-by: MORITA Kazutaka commit 873d74b5c3664d2e883fd06b0c20e9d95e4da3ef Author: Christoph Hellwig Date: Thu Nov 10 16:56:10 2011 -0500 O_DIRECT is not a replacement for O_DSYNC Even if a file is opened with O_DIRECT we still need O_DSYNC / fdatasync to make sure all metadata required to find the data made it to disk. Also clean up the flags handling in ob_open a bit. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 9c043ddf643c73822acda1b73b1a2c87b21ca57e Author: Christoph Hellwig Date: Thu Nov 10 16:55:43 2011 -0500 use O_DSYNC instead of O_SYNC Using O_DSYNC means we do not have to write out the inode if we are overwriting full allocated blocks. For sheepdog that is a fairly usual use case when blocks in an image has already been allocated and the guest OS overwrites previously deleted blocks with new data. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 7aa94aa37b3f80d21e17c9daf6c7cd88e33c014f Author: Christoph Hellwig Date: Thu Nov 10 14:22:29 2011 -0500 store: clean up store_queue_request_local a bit The SD_OP_WRITE_OBJ/SD_OP_READ_OBJ and SD_OP_CREATE_AND_WRITE_OBJ share no code, so split them apart. Also us O_TRUNC instead of calling ftruncate to zero after opening for the SD_OP_CREATE_AND_WRITE_OBJ case. Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 5d513a0f21cbdc143599441159347655ba72c455 Author: Christoph Hellwig Date: Thu Nov 10 14:22:28 2011 -0500 store: propagate open failure in store_queue_request_local Currently store_queue_request_local returns success when an open fails, change this to SD_RES_EIO to indicate failure. It might make sense to make the failure more specific, but this at least fixes the bug for now. 
Signed-off-by: Christoph Hellwig Signed-off-by: MORITA Kazutaka commit 63c538ba3f6d19d58f48f91b5d3148e75394a783 Author: MORITA Kazutaka Date: Tue Nov 8 23:13:54 2011 +0900 cluster: add zookeeper cluster driver This adds initial support for the ZooKeeper cluster driver. To use this driver, please specify comma separated host:port pairs (each corresponding to a ZooKeeper server) to the driver option. For example: $ sheep /store -c zookeeper:host1:3000,host2:3000,host3:3000 TODO: - use asynchronous ZooKeeper APIs - use watch notification instead of loop and sleep Signed-off-by: MORITA Kazutaka commit b299028acd35bbba1dcf4eb17b56da71fd3a8b36 Author: MORITA Kazutaka Date: Mon Nov 7 14:02:36 2011 +0900 cluster/local: fix return value Signed-off-by: MORITA Kazutaka commit 7e6fc6374f5521ed0c324ffd4a27e863ba97e4c5 Author: MORITA Kazutaka Date: Fri Oct 28 02:01:53 2011 +0900 tests: add sample testcases Signed-off-by: MORITA Kazutaka commit 2bc8383a657f12a80dba698005790e1a4186a7d5 Author: MORITA Kazutaka Date: Fri Oct 28 02:01:27 2011 +0900 tests: add Sheepdog cluster emulation library This python script supports: - create a virtual Sheepdog cluster with any number of nodes on localhost - start/stop sheep daemons on the virtual cluster - run collie commands on the virtual cluster - emulate read/write requests from VMs on the virtual cluster Signed-off-by: MORITA Kazutaka commit 781acb03cea72edee442432e638addd071d8fe0c Author: MORITA Kazutaka Date: Fri Oct 28 02:00:22 2011 +0900 sheep: add debug output to use for testing Signed-off-by: MORITA Kazutaka commit 2cfc08dd05e7fec5b48846572653bc000234bde8 Author: MORITA Kazutaka Date: Thu Oct 27 01:31:40 2011 +0900 add make check support for testing If you run 'make check', it will find python scripts in 'tests' directory and execute all methods which starts with 'test_'. 
FIXME: test/Makefile.am is tricky Signed-off-by: MORITA Kazutaka commit 075306fb237181c15018dac65b49b20913b1d477 Author: MORITA Kazutaka Date: Sat Oct 29 16:20:27 2011 +0900 sheep: use local cluster driver when corosync is not available Signed-off-by: MORITA Kazutaka commit 9579ab204ec6a6ac5d039bdab7256e9aef69aa07 Author: MORITA Kazutaka Date: Sat Oct 29 16:35:11 2011 +0900 make corosync cluster driver an optional feature Signed-off-by: MORITA Kazutaka commit 1a452cd861a7eaf0e96aea42e84eaede40efe93d Author: MORITA Kazutaka Date: Thu Oct 13 20:35:24 2011 +0900 cluster: add local cluster driver This driver uses a local file to share information with other nodes. usage: $ sheep /store -c local:/shared/file/name If you don't specify a name of the local file, /tmp/sheepdog_shm will be used. Signed-off-by: MORITA Kazutaka commit ab96a77e7b706276e963e6e3702071aab3201d97 Author: MORITA Kazutaka Date: Sat Oct 29 16:20:27 2011 +0900 sheep: add support for cluster driver-specific option You can specify cluster driver-specific options as follows: $ sheep /store -c [name]:[options] Signed-off-by: MORITA Kazutaka commit ff0725fbea45b78343977e642156ee375d7ec373 Author: MORITA Kazutaka Date: Thu Oct 13 20:35:24 2011 +0900 sheep: initialize workqueues just before event_loop() This patch enables us to use signal in cluster drivers. 
Signed-off-by: MORITA Kazutaka commit 21c70bc012a9a974b51982653343e1c627ae5207 Author: Chris Webb Date: Fri Nov 4 17:51:53 2011 +0000 Tidy up sheepdog error textual representations Signed-off-by: Chris Webb Signed-off-by: MORITA Kazutaka commit 4ebb966cfb3c78f12af6e5fff79753023cabc7e1 Author: Chris Webb Date: Fri Nov 4 17:08:36 2011 +0000 replace foo (bar, baz) with foo(bar, baz) in a couple of places Signed-off-by: Chris Webb Signed-off-by: MORITA Kazutaka commit f3bacbe34de9b7661445cb2ae1c9eafd45e6b3a9 Author: Chris Webb Date: Fri Nov 4 17:08:35 2011 +0000 sheep: more message tidying and standardization Signed-off-by: Chris Webb Signed-off-by: MORITA Kazutaka commit 1240c53b218d8c2cd8b501f2966794fd94882388 Author: Chris Webb Date: Fri Nov 4 17:08:34 2011 +0000 sheep: some simple command line wording and consistency clean-ups Signed-off-by: Chris Webb Signed-off-by: MORITA Kazutaka commit a5395ace3e4e3db82460f01e687247043d9aa0d0 Author: Chris Webb Date: Fri Nov 4 17:08:33 2011 +0000 correct the plural of 'one sheep' to be 'two sheep' everywhere Signed-off-by: Chris Webb Signed-off-by: MORITA Kazutaka commit 553583cf7840f6c6ded0ec336206a29226182661 Author: Chris Webb Date: Fri Nov 4 17:08:32 2011 +0000 sheep: standardize format of strerror-style messages We standardize on wording of the form 'failed to ...: ' followed by the strerror() message, and where possible, we follow the bulk of the existing code in using %m rather than %s with strerror(errno). 
Signed-off-by: Chris Webb Signed-off-by: MORITA Kazutaka commit b71565236fb4622182b44b6681cecd9654d2d442 Author: Chris Webb Date: Fri Nov 4 17:08:31 2011 +0000 sheep: always use the same error message for out-of-memory Signed-off-by: Chris Webb Signed-off-by: MORITA Kazutaka commit f727a9146612500a42ab8786393b0951e8db09da Author: MORITA Kazutaka Date: Thu Nov 3 18:55:36 2011 +0900 collie: fix offset in vdi_read/vdi_write Signed-off-by: MORITA Kazutaka commit 52cda86bfb2d6997710cb251d27b0dc350576f78 Author: MORITA Kazutaka Date: Wed Nov 2 19:08:21 2011 +0900 configure: fix indentation Signed-off-by: MORITA Kazutaka commit 6461a797b7400693cca9ffe8ac50ad8bca77b9dd Author: MORITA Kazutaka Date: Wed Nov 2 15:58:32 2011 +0900 sheep: cache virtual nodes to reduce memory usage Signed-off-by: MORITA Kazutaka commit a995705bd5281e2e67d00065d742022778858180 Author: MORITA Kazutaka Date: Wed Nov 2 15:51:56 2011 +0900 sheep: block I/O requests under high memory pressure This fixes an OOM bug which occurs when there are many I/O requests. Signed-off-by: MORITA Kazutaka commit 09e5278d0690ba8b65fe545dd0ea207070d7ecfb Author: MORITA Kazutaka Date: Tue Nov 1 16:09:39 2011 +0900 cluster/corosync: fix memory leak Signed-off-by: MORITA Kazutaka commit 55de30c25d6741474342a8ef3dffa18664ea370a Author: MORITA Kazutaka Date: Tue Nov 1 14:50:01 2011 +0900 sheep: return error when object cannot be fully replicated This is necessary to keep strong consistency. Signed-off-by: MORITA Kazutaka commit d56e3b6d6d9b278dad7af619c15093ca1f09fb5f Author: Yibin Shen Date: Tue Oct 25 14:55:37 2011 +0800 sheep: fix a network partition issue In some situation, sheep may disconnected from corosync instantaneously, at the same time, both sheep and corosync will keep running but none of them exit, then the disconnected sheep may receive a confchg message from corosync which notify this sheep has left. that will lead to a network partition, this patch fix it. 
Signed-off-by: Yibin Shen Signed-off-by: MORITA Kazutaka commit 67932488517f84469e833e758b8587d0197ece67 Author: MORITA Kazutaka Date: Wed Oct 26 16:51:16 2011 +0900 sheep: fix a race condition for pending vdi_op requests After we exit sd_notify_handler(), the cluster driver can call the next block_cb(). So we need to remove the vdi request from pending_list in sd_notify_handler(). Signed-off-by: MORITA Kazutaka commit d9b3e48ca5f5373266dd873096b774f0899c7442 Author: MORITA Kazutaka Date: Wed Oct 26 15:29:55 2011 +0900 sheep: fix vdi operation order Signed-off-by: MORITA Kazutaka commit 8830b112bd8fe2296e0372e880d5e5cd29cadd2a Author: Yibin Shen Date: Thu Oct 27 09:54:11 2011 +0800 sheep: use correct length for cpg_name Signed-off-by: Yibin Shen Signed-off-by: MORITA Kazutaka commit c9cb7aac55a2eb9ef05205cab08728e11dc5d881 Author: Liu Yuan Date: Wed Oct 26 21:18:16 2011 +0800 sheep: rename var ctime to ct silence the gcc warning of shadowing a global symbol. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit d763dc5b2a8bcb9752ab0c82bf18cf18cac83584 Author: Liu Yuan Date: Wed Oct 26 21:18:15 2011 +0800 sheep: timestamp the epoch log Timestamped epoch is supposed to help users get more information about cluster. Currently, sheepdog just writes the epoch log without timestamp. This patch stores timestamp at the end of the epoch log file.
These timestamps will be seen by collie cluster info command as follows: Cluster status: running Cluster created at Wed Oct 26 10:32:44 2011 Epoch Time Version 2011-10-26 11:09:38 5 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7004] 2011-10-26 11:09:32 4 [192.168.0.1:7001, 192.168.0.1:7002] 2011-10-26 10:38:14 3 [192.168.0.1:7002] 2011-10-26 10:38:13 2 [192.168.0.1:7001, 192.168.0.1:7002] 2011-10-26 10:32:44 1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002] Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit ed7875fa902701bca46802d5395ae6def499c2f5 Author: Liu Yuan Date: Wed Oct 26 21:18:14 2011 +0800 sheep: use update_epoch_log() to write epoch logs We should unify the function calls to write epoch logs and we just need one. epoch_log_write() will be removed in the next patch. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit ebc11a488d211fba0c82bba6b1941a1fffc195a4 Author: CHEN Baozi Date: Wed Oct 26 15:24:30 2011 +0800 Add missing cluster.h in sheep/Makefile.am. Signed-off-by: CHEN Baozi Signed-off-by: MORITA Kazutaka commit bbe0b928e838b0f85d475361ae73b7c67bde8327 Author: CHEN Baozi Date: Wed Oct 26 15:24:29 2011 +0800 Modified m4 macros in configure.ac Support getting version from git repo in redhat. Signed-off-by: CHEN Baozi Signed-off-by: MORITA Kazutaka commit 729ec53cbc4df1352a8d6e3c30c6ae375f7cf21d Author: Liu Yuan Date: Wed Oct 26 10:23:53 2011 +0800 sheep: add SD_OP_RECOVER operation With this patch, manual recovery command starts working.
[Test] script1: for i in 0 1 2; do sheep/sheep -d /store/$i -z $i -p 700$i;sleep 1;done collie/collie cluster format -H collie/collie cluster shutdown; sleep 1 # node 1 2 permanently down for i in 0; do sheep/sheep -d /store/$i -z $i -p 700$i;sleep 1;done for i in 0; do ./collie/collie cluster info -p 700$i; done collie/collie cluster recover for i in 0; do ./collie/collie cluster info -p 700$i; done for i in 3 4; do sheep/sheep -d /store/$i -z $i -p 700$i;sleep 1;done for i in 0 3 4; do ./collie/collie cluster info -p 700$i; done output: root@taobao:/home/dev/sheepdog# ./test2.sh Cluster status: Waiting for other nodes joining Creation time Epoch Nodes Cluster status: running Creation time Epoch Nodes 2011-10-22 02:18:49 2 [192.168.0.1:7000] 2011-10-22 02:18:49 1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002] Cluster status: running Creation time Epoch Nodes 2011-10-22 02:18:49 4 [192.168.0.1:7000, 192.168.0.1:7003, 192.168.0.1:7004] 2011-10-22 02:18:49 3 [192.168.0.1:7000, 192.168.0.1:7003] 2011-10-22 02:18:49 2 [192.168.0.1:7000] 2011-10-22 02:18:49 1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002] .... 
script2: for i in 0 1 2; do sheep/sheep -d /store/$i -z $i -p 700$i;sleep 1;done collie/collie cluster format for i in 0 1 2; do pkill -f "sheep -d /store/$i"; sleep 1; done # mater node 2 permanently down for i in 0 1; do sheep/sheep -d /store/$i -z $i -p 700$i;sleep 1;done for i in 1; do ./collie/collie cluster info -p 700$i; done collie/collie cluster recover -p 7001 for i in 1; do ./collie/collie cluster info -p 700$i; done for i in 0 3; do sheep/sheep -d /store/$i -z $i -p 700$i;sleep 1;done for i in 0 1 3; do ./collie/collie cluster info -p 700$i; done output: Cluster status: Waiting for other nodes joining Creation time Epoch Nodes Cluster status: The sheepdog is stopped doing IO, short of living nodes Creation time Epoch Nodes Cluster status: running Creation time Epoch Nodes 2011-10-22 01:59:36 5 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7003] 2011-10-22 01:59:36 4 [192.168.0.1:7000, 192.168.0.1:7001] 2011-10-22 01:59:36 3 [192.168.0.1:7001] 2011-10-22 01:59:36 2 [192.168.0.1:7001, 192.168.0.1:7002] 2011-10-22 01:59:36 1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002] ... Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit b5a00c8bc31bd19e5758c99efc13400e838905d1 Author: Liu Yuan Date: Mon Oct 24 17:19:16 2011 +0800 sheep: export update_epoch_log() Signed-off-by: Liu Yuan commit 1bc044b9716b8d0a19d627908eb53b81af2cb758 Author: Liu Yuan Date: Mon Oct 24 17:19:15 2011 +0800 collie: add manual recover subcommand for cluster Currently, the sheepdog cluster cannot get recovered for below conditions 1) the master node is physically down after the cluster crashes with different epoches during recovery. 2) some of nodes are physically down after the cluster is shutdowned during recovery. This patch add a manual recovery mechanism. 
With this patch, you can manually recover the cluster at any live node by: $ collie cluster recover and this prompt you some warning, then type "yes" or if you know well what you are doing, you can $ collie cluster recover -f [Use with Caution] This command will increment cluster epoch by 1! for 1) case, you need to try to start up the nodes in sequence for the first round until the master node is up, thanks to the mastership mechanism. If unfortunately not, you can simply run the recover command. After that, you can freely join other good nodes in. for 2) case, you'd better try to start up all the nodes to see if any of nodes get physically down. If any, unfortunately, you can simply run the recover command. Signed-off-by: Liu Yuan commit 12554a66a06c7a03596e3d3594d1b7c10fcc0b20 Author: MORITA Kazutaka Date: Thu Oct 20 22:40:21 2011 +0900 sheep: introduce sd_op_template When we want to add a new operation (SD_OP_xxxxx), it is not clear which codes we should modify. And in some cases, we need to modify codes everywhere to implement one operation. This is not a good design. This patch abstracts out Sheepdog operations into sd_op_template, and moves all the request processing codes to sheep/ops.c. The definition of sd_op_template is as follows: struct sd_op_template { enum sd_op_type type; int force; int (*process_work)(const struct sd_req *req, struct sd_rsp *rsp, void *data); int (*process_main)(const struct sd_req *req, struct sd_rsp *rsp, void *data); }; 'type' is the type of the operation; SD_OP_TYPE_CLUSTER, SD_OP_TYPE_LOCAL, or SD_OP_TYPE_IO. 'force' is set to non-zero if the operations should be processed even when the cluster is not working. 'process_work()' and 'process_main()' are the main functions of this operation. process_work() will be called in the worker thread, and process_main() will be called in the main thread. If type is SD_OP_TYPE_CLUSTER, it is guaranteed that only one node processes a cluster operation at the same time. 
We can use this for something like distributed locking. process_work() will be called on the local node, and process_main() will be called on every nodes. If type is SD_OP_TYPE_LOCAL, both process_work() and process_main() will be called on the local node. If type is SD_OP_TYPE_IO, neither process_work() nor process_main() is used because this type of operation is heavily intertwined with Sheepdog core codes. We will be unlikely to add new operations of this type. Signed-off-by: MORITA Kazutaka commit 5d8ab0de8ee5383a7c5aa5d216ec3b4144d3b96e Author: Liu Yuan Date: Fri Oct 21 12:02:24 2011 +0800 sheep: fix the type of 'flags' in sheepdog_config Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 68d3af677553283b02db498100cd57234c585363 Author: Liu Yuan Date: Fri Oct 21 11:44:41 2011 +0800 sheep: rename SD_FLAG_CMD_DIRECT Change SD_FLAG_CMD_DIRECT into SD_FLAG_CMD_IO_LOCAL to make it more descriptive. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit ba6b2f6c79d27262429a7effa3c6f9ef568cd63a Author: MORITA Kazutaka Date: Fri Oct 21 09:49:45 2011 +0900 sheep: fix a cluster request race We must process all the cluster requests in cpg_work_queue. Otherwise req_done() may be called before __done(), which causes segmentation fault because the request is freed in req_done(). 
Signed-off-by: MORITA Kazutaka commit 6764d2062023c650fae330764bfa15031ef08045 Author: MORITA Kazutaka Date: Thu Oct 20 20:22:43 2011 +0900 sheep: avoid multicasting request data in vdi operations Signed-off-by: MORITA Kazutaka commit 3a4de821c4ee2847776b122e104e953ac8a3118e Author: MORITA Kazutaka Date: Thu Oct 20 18:16:39 2011 +0900 sheep: fix alignment of struct join_message Signed-off-by: MORITA Kazutaka commit 54cf1b03fc0eaaaad16f2e019ca17f4538c64ef8 Author: MORITA Kazutaka Date: Thu Oct 20 17:55:04 2011 +0900 sheep: reduce the size of join message Signed-off-by: MORITA Kazutaka commit 3f383418955212eb7ac8ea0047f5d2176ff0efcb Author: MORITA Kazutaka Date: Thu Oct 20 18:13:06 2011 +0900 sheep: remove message header Signed-off-by: MORITA Kazutaka commit 2882db3dc33fa81b2f8726bd0135dafd3ad83684 Author: Liu Yuan Date: Thu Oct 20 16:24:12 2011 +0800 sheep: rename get/set_global_nr_copies() Unify sheepdog_config APIs. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 84fb8b39ecad85df9d9c623ead6f7ca03f870dd3 Author: Liu Yuan Date: Thu Oct 20 16:24:11 2011 +0800 sheep: clean up group.c by sys_stat_* helpers And in passing fix one minor status check logic in update_cluster_info() for newly joined node. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit fcec75563f9f82796c111cd24bc48920278ab364 Author: Liu Yuan Date: Thu Oct 20 16:24:10 2011 +0800 sheep: make sys->stat operation bit-wise Add sys_stat_* helpers and higher level(sys_can_*) API based on them. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 2c95b2056535e8ffe3af51ff99b7baf7362b5969 Author: MORITA Kazutaka Date: Wed Oct 12 16:00:54 2011 +0900 sheep: stop checking the first joined node We don't need to care about the master node now. 
Signed-off-by: MORITA Kazutaka commit 8919fc3fbeea17dbb20d252727117ada9c2745c1 Author: MORITA Kazutaka Date: Wed Oct 12 15:16:13 2011 +0900 sheep: remove node_list join_handler() and leave_handler() can notify the node list which can be used for consistent hashing now, so we don't need to manage node lists in sheep/group.c any more. With this patch, 'collie node list' doesn't show the master node. But I think it is okay because there may be no master node if we use the other cluster driver than corosync. Signed-off-by: MORITA Kazutaka commit 9547dea18fd75030ad7480e5e2f3a9eac4f1ae0b Author: MORITA Kazutaka Date: Wed Oct 12 14:32:32 2011 +0900 sheep: remove message state There is no multi phases events now, so we don't need a message state. Signed-off-by: MORITA Kazutaka commit 143940b04a9345d2459f0bd915c2ebcf33f4e59e Author: MORITA Kazutaka Date: Wed Oct 12 14:28:40 2011 +0900 sheep: clean up struct join_message Signed-off-by: MORITA Kazutaka commit f1a0686f7966b057fad62fcbe847f6ed3f6ef1a3 Author: MORITA Kazutaka Date: Wed Oct 12 14:21:20 2011 +0900 sheep: remove joining flag We no longer receives any events during node joining, so we can remove the joining flag. Signed-off-by: MORITA Kazutaka commit c6e63fc650e96d26d5110b49624d3a8248ac2c34 Author: MORITA Kazutaka Date: Wed Oct 12 14:19:08 2011 +0900 sheep: remove notification message type We don't use cdrv->notify() for node membership management now, so we don't need to specify the type of message. Signed-off-by: MORITA Kazutaka commit dbfd09c987adf7c44c008ad3bff98bd22a78a84e Author: MORITA Kazutaka Date: Wed Oct 12 08:40:27 2011 +0900 sheep: move node membership management into cluster driver Currently, Sheepdog has two node lists; sd_node_list and cpg_node_list. The former is used for consistent hashing and seen from users. The latter is managed in the cluster driver and notified in join_handler/leave_handler. But this design is too complex. 
We should move all the cluster management stuff into the cluster driver. Main changes of this patch are as follows: - make join process one phase Node joining was really complex; cpg_confchg() notifies the newly joining node, the node multicasts a SD_MSG_JOIN message, and the master node receives it and multicasts the response. Moreover, we couldn't allow any I/O events during two multicasting. This patch moves all of them into the cluster driver. - add check_join_cb() to the join_handler() arguments This callback is called on one of the Sheepdog nodes (e.g. in the case of the corosync driver, the master server will call this). check_join_cb() checks whether the joining node may join the cluster, and returns the result. - use sheepdog_node_list_entry in the arguments of join_handler()/leave_handler() We can use the notified node list for consistent hashing now. Signed-off-by: MORITA Kazutaka commit 9d23d9da1aac7c2f4e2e726d331f4d94fc5e0698 Author: MORITA Kazutaka Date: Fri Oct 7 23:59:04 2011 +0900 sheep: use block_cb for vdi operations This patch uses vdi_op() as a blocking callback of notification. We no longer need to consider that other notifications (vdi operations or membership changes) are delivered during vdi operations. Signed-off-by: MORITA Kazutaka commit 08a2a0783a27c36b67066c34be39ebad2f988c93 Author: MORITA Kazutaka Date: Sun Oct 9 02:22:37 2011 +0900 cluster: add blocking mechanism to notification Currently Sheepdog vdi operations (create/delete/lookup/...) are processed in two phase multicasting: 1. multicasts a vdi request 2. only the master node handles the request and multicasts the response During this two phase, we cannot allow any other vdi operations and membership changes, and this makes sheep/group.c a bit hard to read. This patch simplifies this by adding a blocking callback to the notification function in the cluster driver. 
If the caller of cdrv->notify() sets 'block_cb' as an argument, block_cb() is called from the cluster driver before the message is notified to any node. All the cluster events are blocked in every nodes until the caller finishes the vdi operation in block_cb(). With this change, the master node is no longer in charge of vdi operations, but this is a good change to make Sheepdog more symmetric. Signed-off-by: MORITA Kazutaka commit b46f88705580ac07439d041b90a9173b9cf851f9 Author: MORITA Kazutaka Date: Fri Oct 7 18:18:13 2011 +0900 cluster/corosync: link corosync events to list before dispatching them This is a preparation for blocking callback support. Signed-off-by: MORITA Kazutaka commit da8744cab287d0a9ab8cf50b030cc514d41f39bc Author: MORITA Kazutaka Date: Tue Oct 18 08:11:43 2011 +0900 collie: fix calculation of length in vdi_read()/vdi_write() Signed-off-by: MORITA Kazutaka commit 8d2a2f4213b767a2d6f78e41b10461d61dfb3940 Author: MORITA Kazutaka Date: Tue Oct 18 00:28:45 2011 +0900 sheep: show error reason when SD_RES_EIO happens Signed-off-by: MORITA Kazutaka commit 55443ec8c362116c937c5132779dfe8e1c5ba8e2 Author: MORITA Kazutaka Date: Tue Oct 18 00:13:12 2011 +0900 collie: retry read()/write() when it returns smaller value This ensures the buffer length will be block-aligned. Signed-off-by: MORITA Kazutaka commit 8efc7b16dc93dd88da016232aa267d8b57b7c92f Author: MORITA Kazutaka Date: Tue Oct 18 00:05:16 2011 +0900 collie: make offset and len block-aligned in vdi_read()/vdi_write() Signed-off-by: MORITA Kazutaka commit b3cf84ae250fc2a1eea70a65c827ac1c106ede78 Author: MORITA Kazutaka Date: Fri Oct 14 01:00:39 2011 +0900 make vdi setattr atomic 'collie vdi setattr' runs the following two: - allocates a vdi attr object id - writes an attribute data to the object So a race can happen between these two operations. With this patch, 'collie vdi setattr' sends the attribute data when allocating the object id to write the attribute atomically. 
Signed-off-by: MORITA Kazutaka commit 15fdc00f999b9366151d7d3d19e7dca9edcf72e7 Author: MORITA Kazutaka Date: Tue Oct 11 14:30:24 2011 +0900 add vdi creation time to the vdi attribute header Currently, Sheepdog doesn't clean up vdi attributes when we delete the vdi. So if we create a new vdi whose name was used before, we could read the old attribute. This patch adds a vdi creation time to the attribute header and checks whether the attribute belongs to the current vdi. Note that this is a transitional approach. In future, we should remove vdi attributes when we remove the vdi. Signed-off-by: MORITA Kazutaka commit 047993a41cd8b4b32fedd52b8cdd534ddfe825b2 Author: Liu Yuan Date: Wed Oct 19 18:06:22 2011 +0800 sheep: use SD_STATUS_HALT to stop serving IO We use SD_STATUS_HALT to identify the cluster state when it should not serve IO requests. This is optional, users might risk themselves to turn off this HALT status. As the below command: $ collie cluster format -H or $ collie cluster format --nohalt By default, this is enabled. 
[Test Case] [1] steps: for i in 0 1 2 3; do ./sheep/sheep -d /store/$i -z $i -p 700$i; sleep 1; done ./collie/collie cluster format --copies=3; for i in 0 1; do pkill -f "sheep -d /store/$i"; sleep 1; done for i in 2 3; do ./collie/collie cluster info -p 700$i; done for i in 0 1; do ./sheep/sheep -d /store/$i -z $i -p 700$i; sleep 1; done for i in 0 1 2 3; do ./collie/collie cluster info -p 700$i; done output: Cluster status: The sheepdog is stopped doing IO, short of living nodes Creation time Epoch Nodes 2011-10-11 16:26:02 3 [192.168.0.1:7002, 192.168.0.1:7003] 2011-10-11 16:26:02 2 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003] 2011-10-11 16:26:02 1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003] Cluster status: The sheepdog is stopped doing IO, short of living nodes Creation time Epoch Nodes 2011-10-11 16:26:02 3 [192.168.0.1:7002, 192.168.0.1:7003] 2011-10-11 16:26:02 2 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003] 2011-10-11 16:26:02 1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003] Cluster status: running Creation time Epoch Nodes 2011-10-11 16:26:02 5 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003] 2011-10-11 16:26:02 4 [192.168.0.1:7000, 192.168.0.1:7002, 192.168.0.1:7003] 2011-10-11 16:26:02 3 [192.168.0.1:7002, 192.168.0.1:7003] 2011-10-11 16:26:02 2 [192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003] 2011-10-11 16:26:02 1 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002, 192.168.0.1:7003] ... 
[2] steps: for i in 0 1; do sheep/sheep -d /store/$i -z $i -p 700$i;sleep 1;done collie/collie cluster format for i in 0 1; do collie/collie cluster info -p 700$i;done for i in 0; do pkill -f "sheep/sheep -d /store/$i"; sleep 1; done for i in 2; do sheep/sheep -d /store/$i -z $i -p 700$i;sleep 1;done for i in 1 2; do pkill -f "sheep/sheep -d /store/$i"; sleep 1; done for i in 0 1 2; do sheep/sheep -d /store/$i -z $i -p 700$i;sleep 1;done for i in 0 1 2; do sheep/sheep -d /store/$i -z $i -p 700$i;sleep 1;done for i in 0 1 2; do collie/collie cluster info -p 700$i;done output: Cluster status: The sheepdog is stopped doing IO, short of living nodes Creation time Epoch Nodes 2011-10-16 18:11:07 1 [192.168.0.1:7000, 192.168.0.1:7001] Cluster status: The sheepdog is stopped doing IO, short of living nodes Creation time Epoch Nodes 2011-10-16 18:11:07 1 [192.168.0.1:7000, 192.168.0.1:7001] Cluster status: running Creation time Epoch Nodes 2011-10-16 18:11:07 6 [192.168.0.1:7000, 192.168.0.1:7001, 192.168.0.1:7002] 2011-10-16 18:11:07 5 [192.168.0.1:7000, 192.168.0.1:7002] 2011-10-16 18:11:07 4 [192.168.0.1:7002] 2011-10-16 18:11:07 3 [192.168.0.1:7001, 192.168.0.1:7002] 2011-10-16 18:11:07 2 [192.168.0.1:7001] 2011-10-16 18:11:07 1 [192.168.0.1:7000, 192.168.0.1:7001] ... Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 983159adddbd41c5900360c5da4a42db49c1b7c1 Author: Liu Yuan Date: Wed Oct 19 20:38:26 2011 +0800 sheep: add set/get_cluster_status() Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 5df97d5206764175099ff08459b6402adbe17179 Author: Liu Yuan Date: Wed Oct 19 18:06:20 2011 +0800 sheep: add a helper function to get nr_zones Get number of zones from the list. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 9b6102ce4e0e95bd470710ac05a09bc012e6aa77 Author: Liu Yuan Date: Wed Oct 19 18:06:19 2011 +0800 sheep: introduce SD_STATUS_HALT Currently, sheepdog will serve IO requests even if number of nodes is less than 'copies'. 
When the number of the nodes (or zones) is less than the copies specified by collie-cluster-format command, the sheepdog cluster should stop serving IO requests. This is necessary to solve the below subtle case: + good nodes, - failed nodes. 0 1 2 3 + - - + + --> - --> - --> + + + - # <-- permanently down. ^ | this node has the latest data at stage 3, we will have a cluster recovered without the data tracked at stage 1. When the nodes are in the SD_STATUS_HALT, the sheepdog can also serve configuration change and do the recovery job. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit c2fd032afead5e841f225eb3d8c8859f36548c64 Author: Liu Yuan Date: Wed Oct 19 18:06:18 2011 +0800 sheep: remove wrong 'unknown message' in the log Add a switch case for the __sd_notify_done() to remove the wrong log message. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit e531283897b678ad94e03eb89cad0bc54f6abb58 Author: Liu Yuan Date: Wed Oct 19 18:06:17 2011 +0800 sheep: refactor get_cluster_status() And add a helper func to do the sanity check for cluster status.
Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 2fe6ffeae94d8e992af633af17927eb885e960b6 Author: Liu Yuan Date: Wed Oct 19 18:06:16 2011 +0800 sheep: add a helper epoch_log_read_nr() Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit b2f37293f74b9dedff324aad07465312d5af46ab Author: MORITA Kazutaka Date: Sat Oct 15 13:08:10 2011 +0900 use strtol() instead of atoi() for error check Signed-off-by: MORITA Kazutaka commit 670cc9ef39dcc533a2291f6fbf91f02e9d97f598 Author: MORITA Kazutaka Date: Sat Oct 15 13:22:44 2011 +0900 collie: exit when we cannot get node list Signed-off-by: MORITA Kazutaka commit 49f1030cc0b4e56b8484d9da282063dbce274b9d Author: MORITA Kazutaka Date: Sat Oct 15 12:08:32 2011 +0900 logger: remove printk-style log level check Signed-off-by: MORITA Kazutaka commit 8160b5224856442d8f28570136d4c190c196ab38 Author: MORITA Kazutaka Date: Mon Oct 17 22:28:53 2011 +0900 sheep: remove xattr dependency This patch adds a config file to the Sheepdog store directory, and save Sheepdog cluster info to it instead of extended attributes. We need to update a config file atomically, so we use journaling for the write operations. Signed-off-by: MORITA Kazutaka commit 9ed2ceda6ee4107a3656a69a693cbdbc0f8ee4cf Author: MORITA Kazutaka Date: Mon Oct 17 19:05:35 2011 +0900 sheep: make journaling functions support normal files The current implementation of Sheepdog journaling only supports objects. This patch extends it. Signed-off-by: MORITA Kazutaka commit 737596b4553baf97697c6996285680c52fcf375a Author: Andy chen Date: Fri Oct 14 10:55:42 2011 +0800 logger: refactor the vprintf() function refactor vprintf() from vprintf(fmt, args) to vprintf(int, fmt, args) to be compatible with other user space program.
Signed-off-by: Andy chen Signed-off-by: Yibin Shen Signed-off-by: MORITA Kazutaka commit 06d5447d02e51a756176c5cf3904fbf702bcc272 Author: Andy chen Date: Fri Oct 14 10:55:41 2011 +0800 logger: add sheep log level help func Signed-off-by: Andy chen Signed-off-by: Yibin Shen Signed-off-by: MORITA Kazutaka commit 43aa024836e314b905801f041add3fbd06896f80 Author: Andy chen Date: Fri Oct 14 10:55:40 2011 +0800 logger: redefine sheep log priorities make sheep log priorities compliant with syslog spec Signed-off-by: Andy chen Signed-off-by: Yibin Shen Signed-off-by: MORITA Kazutaka commit 755a82f41ea552a2c1b977e46f26d44c2408a97d Author: Andy chen Date: Fri Oct 14 10:55:39 2011 +0800 logger: fix log level not work problem now if we set log level when start sheep, the log level not work, this patch fix this Signed-off-by: Andy chen Signed-off-by: Yibin Shen Signed-off-by: MORITA Kazutaka commit bce15bd8bf57aa44b89ca07215c15810c67bcb6b Author: Andy chen Date: Fri Oct 14 10:55:38 2011 +0800 logger: reset openlog option if openlog option set to '0', the result is undefined, so set the openlog() option to 'LOG_CONS | LOG_PID' Signed-off-by: Andy chen Signed-off-by: Yibin Shen Signed-off-by: MORITA Kazutaka commit 16d09d28e926c600d47c24207dec87b77949a64e Author: Andy chen Date: Mon Oct 10 17:03:44 2011 +0800 use 24 hour system instead of 12 hour system Original log use 12 hour system, but not use time format parameter '%p' to print AM/PM, this patch fix it Signed-off-by: Andy chen Signed-off-by: MORITA Kazutaka commit 26ab1769f4eac94b22900298db37c0c9101270b5 Author: MORITA Kazutaka Date: Tue Oct 4 20:19:03 2011 +0900 cluster: add check for cluster driver definition Signed-off-by: MORITA Kazutaka commit c4357c7f0e4ded86f6a1dcbcffeccccf7cf59307 Author: MORITA Kazutaka Date: Tue Oct 4 17:35:24 2011 +0900 sheep: use cluster driver This patch removes all corosync stuff from group.c, and uses a cluster driver instead of it.
Signed-off-by: MORITA Kazutaka commit 338e04a3c1a8ba679761860c41fed922e08d1b21 Author: MORITA Kazutaka Date: Tue Oct 4 17:29:46 2011 +0900 sheep: make first_cpg_node local There is no need to define this variable in struct work_confchg. Signed-off-by: MORITA Kazutaka commit f4492d89c6e0d76579962ceda47cabc9c01d752e Author: MORITA Kazutaka Date: Tue Oct 4 17:27:14 2011 +0900 cluster: add corosync driver Signed-off-by: MORITA Kazutaka commit 791deaf477069a009ff4e92d453dc75a1c0ff71f Author: MORITA Kazutaka Date: Tue Oct 4 17:12:46 2011 +0900 sheep: remove corosync nodeid This patch uses a generic sheepid instead of a corosync-specific node id. This patch is necessary to remove the dependency on corosync. Signed-off-by: MORITA Kazutaka commit 0671d03b19bbbdab69e64b111805235b6fb8bd50 Author: MORITA Kazutaka Date: Tue Oct 4 15:57:48 2011 +0900 sheep: define cluster driver interface This patch abstracts out a cluster management of Sheepdog, and introduces a cluster driver interface. Signed-off-by: MORITA Kazutaka commit bc0ab5204f8d40b69d787b27272f1e5635bdf565 Author: MORITA Kazutaka Date: Wed Oct 5 23:27:33 2011 +0900 sheepdog 0.2.4 Signed-off-by: MORITA Kazutaka commit 196449160ea22195c4da8079136eeb5d5348ba28 Author: MORITA Kazutaka Date: Fri Sep 30 03:43:59 2011 +0900 move panic() to logger.h panic() should be available for all who can use logger. 
Signed-off-by: MORITA Kazutaka commit 20bb9a71cfdede5b2ee20347f3b0710980331cea Author: Andy chen Date: Thu Sep 29 14:34:49 2011 +0800 only the joined and left nodes have reason Signed-off-by: Andy chen Signed-off-by: MORITA Kazutaka commit 73076a410ad36aa38e049ffee6970a7fa4e510f9 Author: Yibin Shen Date: Thu Sep 29 13:46:18 2011 +0800 sheep: cpg_local_get() should be called before cpg_join() Signed-off-by: Yibin Shen Signed-off-by: MORITA Kazutaka commit 0f7d1264355f2fdc55d8a19e095ed064a9668e83 Author: Yibin Shen Date: Wed Sep 28 18:11:56 2011 +0800 sheep: rename cpg_* function's return value from CS_ prefix to CPG_ Signed-off-by: Yibin Shen Signed-off-by: MORITA Kazutaka commit 83f0ccd67279dc9bed2c48c42f485e020702f017 Author: Yibin Shen Date: Wed Sep 28 18:11:55 2011 +0800 sheep: check return value for cpg_* function Signed-off-by: Yibin Shen Signed-off-by: MORITA Kazutaka commit 618679e39b417c8798d75259b327ec4ad7b0e56a Author: Liu Yuan Date: Wed Sep 28 15:34:44 2011 +0800 sheep, sdnet: use is_myself() helper to check target node Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 3b2c291a338610b975395a3523464def5ab705e5 Author: MORITA Kazutaka Date: Sat Sep 24 18:05:40 2011 +0900 add git based versioning Signed-off-by: MORITA Kazutaka commit 9105dea6a75f9da2b90779119424e6162dcbcb8a Author: Liu Yuan Date: Mon Sep 26 18:57:25 2011 +0800 sheep: tame sheep to recover the crash cluster [Rationale] Currently, we have to start up the first failed node or last failed one to recover the crash cluster (nodes with different epoch histories). This patch simply removes this disgusting constraint. This patch adds a new concept in the crash cluster recovery phase: mastership transfer. The master node in the sheepdog is supposed to reply the nodes' join requests, hence managing other nodes join. It is the first node in the join stage and the last node in the crashed cluster.
When we recover the crash cluster, the mastership is transferred one another until the last failed node is started up. When the node is not the last failed one, one of the two nodes simply exits, only one node will be left in the recover stage with the mastership. After the last failed node is started up, we can join others safely with consistent epoch histories. With this patch, there is no start-up order imposed for the crash cluster to recover. As side effect, epoch transfer of leave node is removed. Leave node concept is changed a bit during crashed cluster recovery, that it is defined as "nodes that are supposed to leave and contained in the latest epoch". [Test Cases] The methods that I have tried to test this idea: $ for i in 0 1 2; do ./sheep/sheep /store/$i -z $i -p 700$i; sleep 1; done $ collie/collie cluster format $ for i in 0 1 2; do pkill -f "sheep /store/$i"; sleep 1; done $ for i in 1 0 2; do ./sheep/sheep /store/$i -z $i -p 700$i; sleep 1; done $ for i in 0 2; do ./sheep/sheep /store/$i -z $i -p 700$i; sleep 1; done $ for i in 0 1 2; do ./collie/collie cluster info -p 700$i; done ... 2011-09-25 11:38:17 5 [192.168.0.4:7000, 192.168.0.4:7001, 192.168.0.4:7002] 2011-09-25 11:38:17 4 [192.168.0.4:7000, 192.168.0.4:7002] 2011-09-25 11:38:17 3 [192.168.0.4:7002] 2011-09-25 11:38:17 2 [192.168.0.4:7001, 192.168.0.4:7002] 2011-09-25 11:38:17 1 [192.168.0.4:7000, 192.168.0.4:7001, 192.168.0.4:7002] ...
And Kazutaka's gorgeous test case: for i in 0 1; do ./sheep/sheep /store/$i -z $i -p 700$i; sleep 1; done ./collie/collie cluster format for i in 2 3 4; do pkill -f "sheep /store/$((i - 2))" ./sheep/sheep /store/$i -z $i -p 700$i sleep 1 done for i in 3 4; do pkill -f "sheep /store/$i"; sleep 1; done for i in 0 1 2 3 4; do ./sheep/sheep /store/$i -z $i -p 700$i; sleep 1; done # now master 4 is recovered for i in 0 1 2 3; do ./sheep/sheep /store/$i -z $i -p 700$i; sleep 1; done for i in 0 1 2 3 4; do ./collie/collie cluster info -p 700$i; done Cluster status: running Creation time Epoch Nodes 2011-09-25 11:47:49 12 [192.168.0.4:7000, 192.168.0.4:7001, 192.168.0.4:7002, 192.168.0.4:7003, 192.168.0.4:7004] 2011-09-25 11:47:49 11 [192.168.0.4:7000, 192.168.0.4:7001, 192.168.0.4:7002, 192.168.0.4:7004] 2011-09-25 11:47:49 10 [192.168.0.4:7000, 192.168.0.4:7001, 192.168.0.4:7004] 2011-09-25 11:47:49 9 [192.168.0.4:7000, 192.168.0.4:7004] 2011-09-25 11:47:49 8 [192.168.0.4:7004] 2011-09-25 11:47:49 7 [192.168.0.4:7003, 192.168.0.4:7004] 2011-09-25 11:47:49 6 [192.168.0.4:7003] 2011-09-25 11:47:49 5 [192.168.0.4:7002, 192.168.0.4:7003] ... All is done with good and consistent epoch histories as expected. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 2e0bbfe2e6a1c6c2be2b91d195b61d50c850ff8f Author: Liu Yuan Date: Mon Sep 26 18:57:24 2011 +0800 sheep: add SD_MSG_MASTER_TRANSFER message This is suggested by Kazutaka for crash cluster recovery to remove the timing race caused by leave message.
This race was found by Kazutaka's script below: #!/bin/bash # create a directory which has a different creation time sheep /store/1 -p 7001 sleep 1 collie cluster format -p 7001 collie cluster shutdown -p 7001 sleep 1 # start Sheepdog sheep /store/0 -p 7000 sleep 1 collie cluster format -p 7000 while true; do sheep /store/1 -p 7001 sheep /store/2 -p 7002 # wait for node join while [ "`collie cluster info -p 7002 -r 2>&1 | head -1`" != 'running' ]; do sleep 0.1 done if [ "`collie node list -p 7002 -r | wc -l`" -ne 2 ]; then # break if the result is not correct break fi pkill -f "sheep /store/2" done # show results collie cluster info -p 7000 collie cluster info -p 7002 Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit c37dbddcb76c578938a95b3dee7e826d3eea910c Author: Liu Yuan Date: Mon Sep 26 18:57:23 2011 +0800 sheep: add helpers for message handling Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit cb986e4423a4a48763ee4d170427d56c8aacade4 Author: Liu Yuan Date: Sat Sep 24 12:14:54 2011 +0800 sheep: rename find_leave_node into find_entry_list() Let's differentiate between find_node() and find_entry_*() Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 72cd67cd8db8ab6387870253ffc598f95fa60d07 Author: Liu Yuan Date: Sat Sep 24 12:14:53 2011 +0800 sheep: make add_node_to_leave_list() more readable Replace the switch with a more proper if-else clause. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 3b994457488bc71736bb77271f73b039d73d71f8 Author: Liu Yuan Date: Sat Sep 24 12:14:52 2011 +0800 logger: move init_base_path() ahead of init_log() We need to set up base dirs in the first place, otherwise sheep daemon will exit when there are no directories created beforehand. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 46da50f67c6b065f14eaf96f429bc0ad2163f15a Author: Liu Yuan Date: Thu Sep 22 16:43:26 2011 +0800 logger: fix error return path We should return immediately when 'open' error-returns.
This will make dangling logger process disappear in error case. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 066d75396f982bfe06f1222f8765de1e957f4f27 Author: Liu Yuan Date: Wed Sep 21 18:14:01 2011 +0800 sheep: get consistent cluster information on each node Currently, we just try to get cluster information from local epoch history. We coudn't get the full history when local epoch log doesn't has requested epoch version, resulting in displaying inconsistent epoch history. This patch add a new function to read epoch log remotely and we also add a private sheepdog operation (SD_OP_GET_EPOCH) to achieve the objective. When any of the nodes in the cluster doesn't has the requested epoch, we just display null string like following: root@taobao:/home/dev/sheepdog# collie/collie cluster info Cluster status: running Creation time Epoch Nodes 2011-09-21 17:58:15 7 [192.168.0.1:7000, 192.168.0.2:7000] 2011-09-21 17:58:15 6 [192.168.0.1:7000, 192.168.0.2:7000, 192.168.0.3:7000] 2011-09-21 17:58:15 5 [192.168.0.1:7000, 192.168.0.2:7000] 2011-09-21 17:58:15 4 [192.168.0.1:7000] 2011-09-21 17:58:15 3 [] <---- null string 2011-09-21 17:58:15 2 [192.168.0.2:7000, 192.168.0.3:7000] 2011-09-21 17:58:15 1 [192.168.0.1:7000, 192.168.0.2:7000, 192.168.0.3:7000] Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 672cd8933532334d1586fdfe8ffb5e9fabac9404 Author: Liu Yuan Date: Wed Sep 21 18:14:00 2011 +0800 sheep: teach sheepdog to better recovery the cluster [Problem] Currently, sheepdog cannot recovery cluster into full functional state if any node in the cluster fails to join cluster after shutdown, because of being considered unhealthy (F.g. the targeted epoch content is corrupted, epoch version mismatch). That is, the cluster can only get worked again only *if* all the nodes join the cluster successfully. For 14 nodes in the cluster, ==========*=== <--- cluster refuses to work ^ | unhealthy node This is quite awkward. 
The cluster with many nodes after being shutdowned, we easily meet the condition that some of nodes are unhealthy that are rejected by the master during join stage.This patch gives sheepdog some kind of intelligence to deal with unhealthy nodes and process to recovery when all the nodes alive reach the agreement. [Design] This patch add a new concept into sheepdog, the *leave node*. The _leave node_ is the one that the master checks and finds it unhealthy (unmatched epoch content), so marks it as 'leave node', meaning that it is supposed to leave the cluster. The leave nodes are queued in the leave list, *only* exist during SD_STATUS_WAIT_FOR_JOIN. All the leave nodes stop sheep itself automatically after being started. The key idea for *when* the node reach the agreement is very simple, can be summed up into one equation: nr_nodes_in_epoch == nr_nodes_in_sd_list + nr_nodes_in_leave_list When this is reached, all the nodes alive in the cluster will begin to recovery and finally with the epoch incremented by 1, if some other nodes are considered to leave. This is because, after the cluster recovery, we can then try *re-join* the unhealth nodes and probably succeed! [Cases] There is no order imposed to start up the nodes in the cluster. That is, you can start up the nodes after shutdown in arbitrary order, whether it is good node or bad node (Yes, you cannot know node status before you try to start it up). For e.g ====** <-- the cluster will recovery with 4 nodes alive with epoch incremented by 1 ==***= <-- the cluster will recovery with 3 nodes alive with epoch incremented by 1 =***** <-- the cluster will recovery with 1 nodes alive with epoch incremented by 1 |* try to re-join v =*==== --> ====== <- we might get 6 nodes alive. The corner case is that we start one of the bad nodes first before any healthy nodes. If this happens, all the other nodes are considered 'unhealthy', then the very one node will recovery after we try to join all the nodes. 
*===== --> the cluster will recovery with 1 nodes alive with epoch incremented by 1 ^ | from now on, we can re-join the nodes, and will end up with 6 nodes alive. [Note] This patch changes join_message layout! So you have to recompile all the sheepdogs in your cluster before it works out. We need to modify the join_message layout, because of the following scenario: =**=== <-- the last 3 nodes have to need know previous 2 leave nodes information. When the nodes following leave nodes join, the master is supposed to reply with leave nodes information via send_join_response(). Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 42b49e99e324199e709b8b339b9885e66ab43d12 Author: Liu Yuan Date: Wed Sep 21 18:13:59 2011 +0800 sheep: add a new helper to update epoch log This is intended for later patch set use to reduce duplicate code. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 8c940bbc2e2d45b2ef340e0275b64f12c0c80199 Author: MORITA Kazutaka Date: Sat Sep 17 17:14:54 2011 +0900 sheep: fix a memory leak If a connection is closed before finishing sending data, free_request() is not called against it. This patch triggers client_tx_handler() even if the connection is closed, and calls free_request() in it. Signed-off-by: MORITA Kazutaka commit bc1be9029e617a9b061ae9c5858e87e50105f564 Author: Liu Yuan Date: Sun Sep 18 11:33:30 2011 +0800 sheep: prefix print_node_list macro's local variable The code are very likey to use 'name' and 'node' as its own local variable. We need to avoid possible name collision in general macro. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit d35f477fee63fed0bedae8debe52e450a7804ac0 Author: Liu Yuan Date: Sun Sep 18 11:33:29 2011 +0800 sheep: add two helpers to get nodes number from list and epoch get_nodes_nr_epoch() is supposed for later patch set use. 
Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 104efb0c4ba989e8fb50f7e2c85eeab6d892aea4 Author: Liu Yuan Date: Sun Sep 18 11:33:28 2011 +0800 sheep: move eprintf in update_cluster_info() If the node is already joined, we lose the chance to see log information. So move it forward to a better place. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit d74be024ffeb78c6b887f625e2229d1d742c83b1 Author: Liu Yuan Date: Sun Sep 18 11:33:27 2011 +0800 sheep: initialize ret in fix_object_consistency() GCC whines about this, so quiet it. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 77f26b4d8aad7001c40ca88194e9f1683ebda332 Author: Liu Yuan Date: Sun Sep 18 00:56:39 2011 +0800 sheep: get max_logs right in cluster_queue_request() The calculation for max_logs is wrong and this would result in a Segmentation Fault when querying cluster epoch information by 'collie cluster info' Reported-by: Shawn Moore Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 35f8682e250c925773718b5e148681cdf7a6b02f Author: Liu Yuan Date: Fri Sep 16 11:35:24 2011 +0800 sheep: unify nodeid format in debug info Some are in decimal, which causes much inconvenience. So unify them into hex. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 1a1313951b240560832adbb639349def47c21b8e Author: Liu Yuan Date: Fri Sep 16 11:35:23 2011 +0800 sheep: remove unused code in __fill_obj_list() Remove it as the comment suggests. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit dc9260f674b8f5454741752895563a0a9937c172 Author: MORITA Kazutaka Date: Thu Sep 15 13:10:29 2011 +0900 sheep: list local objects efficently This also fixes a buffer overflow problem which occurs when there are many epochs.
Signed-off-by: MORITA Kazutaka commit 3a2801bfc381f99863d4bc196d9a06c437bbb9b4 Author: Liu Yuan Date: Tue Sep 13 16:35:26 2011 +0800 sheep: get vdi bitmap logic right in join phase Newly added nodes cannot get old vdi information that was created before they joined. This is because in the join phase, we don't get vdi bitmap right and just get vdi bitmaps from nodes that are in new node's sd_list. So the correct way is: (as MORITA Kazutaka suggested) 1 call get_vdi_bitmap_from_sd_list() to get bitmaps from the node's sd_list. 2 call get_vdi_bitmap_from(msg->from) to get bitmap from the delivery node. 3 call get_vdi_bitmap_from(msg->nodes) to get bitmaps from the old configuration. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 48386de1ba697adb53f799d2f77195240335f63b Author: Liu Yuan Date: Tue Sep 13 16:35:25 2011 +0800 sheep: add get_vdi_bitmap_from(node) This patch adds a new function, which gets the vdi bitmap from the targeted node. Rename get_vdi_bitmap_from_all() into get_vdi_bitmap_from_sd_list() to make it more accurately described by name. As a side effect, fix some error return paths in get_vdi_bitmap_from_all(). Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 0158fc035b3c4b94a70177bdf8a35ccf4c93327c Author: Liu Yuan Date: Sun Sep 11 01:26:05 2011 +0800 sheep, sdnet: add a dprintf for queue_request() Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit f821f145be1672881d76982dc4e9d65a8aeb3f00 Author: Liu Yuan Date: Sun Sep 11 01:26:04 2011 +0800 sheep: put init_store() in the proper place init_store() calls init_epoch_path(), which relies on the log utility to do the logging. However, log_init() is called after init_store(). So if we enable debug mode, it will leak some of the debug information on standard output that otherwise is expected in the log file. This patch makes logger behave itself.
Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit c2c3f4b15c6e6034cf1caee0127d0b73d7d77f0e Author: MORITA Kazutaka Date: Fri Sep 2 00:37:32 2011 +0900 sheep: handle CPG_EVENT_DELIVER even if there are outstanding I/Os This patch prevents VM I/Os from blocking a CPG_EVENT_DELIVER event. Signed-off-by: MORITA Kazutaka commit f2d8f853e3d04d9396542f00c4366bb360ad8bd9 Author: MORITA Kazutaka Date: Fri Sep 2 00:33:54 2011 +0900 sheep: handle CPG_EVENT_REQUEST even if CPG_EVENT_DELIVER exists This patch prevents a CPG_EVENT_DELIVER event from blocking VM I/Os. Signed-off-by: MORITA Kazutaka commit aa40151cc6e4b004fabc786e8d5250e714b7c65b Author: MORITA Kazutaka Date: Thu Sep 1 22:35:10 2011 +0900 sheep: use multiple work queues Currently, Sheepdog uses only one work queue. So if many VMs send a forwarding requests to sheep daemons at the same time and they consume all the worker threads, it will cause freeze of Sheepdog. The simplest way to solve this problem is that Sheepdog uses multiple work queues and puts forwarding requests and I/O requests into different queues. This also solves the problem that sheep daemons use too many socket descriptors on the large cluster environment. Signed-off-by: MORITA Kazutaka commit 5bf919f26c68ed9aee2f4de343b30c9917101a54 Author: MORITA Kazutaka Date: Thu Sep 1 18:27:10 2011 +0900 sheep: fix a wrong error check of exec_req() Signed-off-by: MORITA Kazutaka commit e8a9c5f2ae6dec1ac85de825f03925c08857976d Author: MORITA Kazutaka Date: Thu Sep 1 18:00:52 2011 +0900 sheep: allow zone id to be zero Currently, the default zone id is the corosync node id, so zero is not a special number. Signed-off-by: MORITA Kazutaka commit fe0cfa7a84de96a6c38e52a16138d19d471a323e Author: MORITA Kazutaka Date: Thu Sep 1 17:14:16 2011 +0900 sheep: remove object list file Sheepdog creates a object list file (*.list) when the epoch is changed. This file is used for the response of SD_OP_GET_OBJ_LIST. 
But when many nodes are added to the cluster at the same time, it takes a long time to create many list files, and SD_OP_GET_OBJ_LIST can result in a timeout error. This patch completely removes the object list file from Sheepdog. In the response of SD_OP_GET_OBJ_LIST, sheep simply calls readdir() on the store directory, and lists all the object ids stored on the local node. Signed-off-by: MORITA Kazutaka commit 713b82fde75f363e2d6bb13d055207245c4437a5 Author: MORITA Kazutaka Date: Thu Sep 1 13:57:01 2011 +0900 sheep: setup node_list_entry before starting object recovery This avoids redundant disk access and simplifies the recovery code. Signed-off-by: MORITA Kazutaka commit 13a471f68dcc4e94a45f40038e11e5a2fe992884 Author: MORITA Kazutaka Date: Thu Sep 1 02:44:53 2011 +0900 sheep: find vdis from old epoch directories too The latest epoch directory may not contain all the vdi objects if the node crashed before completing the object recovery. To avoid data loss, it is better to find vdi objects from older epoch directories too. Signed-off-by: MORITA Kazutaka commit 4dc365c4e0e263ecb2a88c8b854a2a35e362c8a1 Author: Liu Yuan Date: Wed Aug 31 17:44:59 2011 +0800 sheep: do the sanity check to mute gcc Gcc complains about ftruncate() usage, so mute it. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit a8f2c17c0ca2e70b1d471c4c46a9906d6c75e5e0 Author: Liu Yuan Date: Wed Aug 31 17:44:58 2011 +0800 sheep: remove unused code in __start_recovery() fill_obj_list() sets rw->retry to 1 only in the error case, which results in the code jumping to the 'fail' label. So the code will always skip the rw->retry checking. This patch removes unused lines.
Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit c967a037b7bd9c394b01afb6b863e00bafc90cf0 Author: Liu Yuan Date: Wed Aug 31 17:44:57 2011 +0800 sheep: add a dprintf for store_queue_request_local Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 438f06f909b2d5d8105ce3424f6f8ada3f2da6b6 Author: Liu Yuan Date: Wed Aug 31 15:23:13 2011 +0800 sheep, journal: check ret before calling into jrnl_handlers We should do this sanity check to avoid a possible segmentation fault or further wrong code path, because in the error case, jrnl_type is incorrect. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit 334f0462050db72fa1a3488ff14fc25eb0a6a3fb Author: Liu Yuan Date: Mon Aug 29 14:36:49 2011 +0800 sheep: exit when corosync stops running unexpectedly When corosync dies, sheep will receive a signal but currently ignores it. If corosync restarts, we have to manually kill sheep and restart it to work with corosync. For simplicity, sheep should exit when corosync dies. Other policies, like handling corosync re-join, will be considered later. Signed-off-by: Liu Yuan Signed-off-by: MORITA Kazutaka commit ed24aea15210b87d16baf5adcddf9616b9c50d32 Author: MORITA Kazutaka Date: Sun Aug 14 18:10:15 2011 +0900 logger: set SEM_UNDO for semget() If sheep terminates during a logging operation, the sheep logger cannot get the semaphore any more and cannot exit in log_sigsegv(). To resolve the problem, this patch sets SEM_UNDO for semget().
Signed-off-by: MORITA Kazutaka commit d21a46f9218f2e4ee0cf404f46d85e15f42579e0 Author: MORITA Kazutaka Date: Sun Aug 14 15:41:01 2011 +0900 sheep: avoid exiting directly when failing to initialize cluster Signed-off-by: MORITA Kazutaka commit 79a05ce41e921390b0d0ee0a771b12b2d686ae49 Author: MORITA Kazutaka Date: Sun Aug 14 15:24:19 2011 +0900 flush logs before stopping logger Signed-off-by: MORITA Kazutaka commit 35a3965b25ebdb5b21a8bd1015620ec07f457520 Author: MORITA Kazutaka Date: Sun Aug 14 16:37:50 2011 +0900 sheep: stop logger process before terminating Signed-off-by: MORITA Kazutaka commit ea3f09bda1d5e3611a93fb8217da84ff6bbe7662 Author: Chris Webb Date: Sat Aug 13 11:56:22 2011 +0100 collie vdi write: make offset and length arguments optional Usage: collie vdi read [ []] [-a address] [-p port] [-h] If is not specified, we write from to the end of the vdi or EOF on STDIN, whichever is reached first. If is also not specified, we write from the start of the vdi. Signed-off-by: Chris Webb Signed-off-by: MORITA Kazutaka commit 4503b60ccac49be574fbcdfeec1d9f1fe89ebc0b Author: Chris Webb Date: Sat Aug 13 11:56:21 2011 +0100 collie vdi write: do not try to write beyond end of vdi Signed-off-by: Chris Webb Signed-off-by: MORITA Kazutaka commit 1c602d2996b4dfdd4cdfea31be5aad0bbbf0ed6a Author: Chris Webb Date: Sat Aug 13 11:56:20 2011 +0100 collie vdi write: stop trying to read STDIN on EOF Signed-off-by: Chris Webb Signed-off-by: MORITA Kazutaka commit bb9ad20c1e5e044e2db0db2f76fcac52f97eddd7 Author: Chris Webb Date: Sat Aug 13 11:56:19 2011 +0100 collie vdi read: make offset and length arguments optional Usage: collie vdi read [ []] [-s snapshot] [-a address] [-p port] [-h] If is not specified, we read from to the end of the vdi. If is also not specified, we read the entire vdi. 
Signed-off-by: Chris Webb Signed-off-by: MORITA Kazutaka commit a55ee6a26440f3f206734ad2e234748cd77ab19e Author: Chris Webb Date: Sat Aug 13 11:56:18 2011 +0100 collie vdi read: do not try to read beyond end of vdi Signed-off-by: Chris Webb Signed-off-by: MORITA Kazutaka commit 28b04ed60c6e531675169c834dbeeb7eb66a3e48 Author: MORITA Kazutaka Date: Wed Aug 10 06:37:27 2011 +0900 collie: add support for writing data to the vdi This command reads data from the standard input, and writes the data to the Sheepdog virtual disk image. Note that Sheepdog doesn't allow concurrent write accesses from multiple clients; you cannot use this command when another VM uses the disk image. $ collie vdi write -h vdi write - write data to a image Usage: collie vdi write [-a address] [-p port] [-h] Command parameters: -a, --address specify the daemon address (default: localhost) -p, --port specify the daemon port -h, --help display this help and exit Signed-off-by: MORITA Kazutaka commit 08ef1557e94a6221d0d945cb27f4e2630400fccc Author: MORITA Kazutaka Date: Wed Aug 10 01:45:42 2011 +0900 collie: add support for reading data from the vdi This command reads data from a vdi directly, and prints the data to the standard output. $ collie vdi read -h vdi read - read data from a image Usage: collie vdi read [-s snapshot] [-a address] [-p port] [-h] Command parameters: -s, --snapshot specify a snapshot id or tag name -a, --address specify the daemon address (default: localhost) -p, --port specify the daemon port -h, --help display this help and exit Signed-off-by: MORITA Kazutaka commit bc582addb9684672c2b8ed40a7d7931226b39152 Author: MORITA Kazutaka Date: Wed Aug 10 00:59:05 2011 +0900 collie: split a collie.c into subcommand files collie.c becomes large and we expect that collie will support more commands in future, so let's split the code for readability. 
Signed-off-by: MORITA Kazutaka commit d5d4672002361613db9f8581b77751614f8229e9 Author: MORITA Kazutaka Date: Tue Aug 9 21:56:59 2011 +0900 sheep: use corosync nodeid as a default zone id value Usually, we don't want to replicate data in the same machine. This patch makes sheep daemons in the same node use the same zone id. If you want to assign a zone id manually (e.g. enable rack-aware data placement, emulate multiple nodes in one physical machine), specify it in the sheep command line options explicitly with a '-z' option. Signed-off-by: MORITA Kazutaka commit a44a7503a440f021e6b65b35fe9c430706014fca Author: MORITA Kazutaka Date: Tue Aug 9 21:27:21 2011 +0900 logger: initialize log_level It seems that we lost log messages which were printed before we call log_init(). Signed-off-by: MORITA Kazutaka commit 4b7b70e371ea509db2be188a91f343b9f7e72af0 Author: MORITA Kazutaka Date: Sat Aug 6 04:41:24 2011 +0900 collie: add vdi resize support $ collie vdi resize -h vdi resize - resize a image Usage: collie vdi resize [-a address] [-p port] [-h] Command parameters: -a, --address specify the daemon address (default: localhost) -p, --port specify the daemon port -h, --help display this help and exit Signed-off-by: MORITA Kazutaka commit 8c14e57d8d4b2026bbc50c7bb8f662a12fdb294b Author: MORITA Kazutaka Date: Sat Aug 6 04:07:16 2011 +0900 collie: add vdi clone support $ collie vdi clone -h vdi clone - create a clone image Usage: collie vdi clone [-s snapshot] [-P] [-a address] [-p port] [-h] Command parameters: -s, --snapshot specify a snapshot id or tag name -P, --prealloc preallocate all the data objects -a, --address specify the daemon address (default: localhost) -p, --port specify the daemon port -h, --help display this help and exit Signed-off-by: MORITA Kazutaka commit 16f40a10fef6d9078edb78ea5fce3b0c3363b217 Author: MORITA Kazutaka Date: Sat Aug 6 03:25:04 2011 +0900 collie: add vdi snapshot support $ collie vdi snapshot -h vdi snapshot - create a snapshot Usage: collie 
vdi snapshot [-s snapshot] [-a address] [-p port] [-h] Command parameters: -s, --snapshot specify a snapshot id or tag name -a, --address specify the daemon address (default: localhost) -p, --port specify the daemon port -h, --help display this help and exit Signed-off-by: MORITA Kazutaka commit 08c7dd8cc6e99131da137a2b3fb7877b8f153aed Author: MORITA Kazutaka Date: Sat Aug 6 02:45:37 2011 +0900 collie: add vdi creation support $ collie vdi create -h vdi create - create a image Usage: collie vdi create [-P] [-a address] [-p port] [-h] Command parameters: -P, --prealloc preallocate all the data objects -a, --address specify the daemon address (default: localhost) -p, --port specify the daemon port -h, --help display this help and exit Signed-off-by: MORITA Kazutaka commit b42afeaf0084e2db4c8042834822a3b4e51276c3 Author: MORITA Kazutaka Date: Sat Aug 6 01:14:14 2011 +0900 collie: add helper functions to read/write objects Signed-off-by: MORITA Kazutaka commit 701f59ef0aad0d97c9ddbfd147ab00a1fd67d700 Author: MORITA Kazutaka Date: Fri Aug 5 02:45:22 2011 +0900 collie: fix comparison of command names Signed-off-by: MORITA Kazutaka commit b15ea5acab365cd7ddd494ea8f2e195e242df0da Author: MORITA Kazutaka Date: Thu Aug 4 19:05:02 2011 +0900 collie: add subcommand usage description This patch adds subcommand help.
Example: $ collie vdi setattr -h vdi setattr - set a vdi attribute Usage: collie vdi setattr [value] [-d] [-x] [-a address] [-p port] [-h] Command parameters: -d, --delete delete a key -x, --exclusive write in an exclusive mode -a, --address specify the daemon address (default: localhost) -p, --port specify the daemon port -h, --help display this help and exit Signed-off-by: MORITA Kazutaka commit b1dbe61c2acfd40a5f8bff3eb245b3446ffc4036 Author: MORITA Kazutaka Date: Thu Aug 4 17:08:22 2011 +0900 sheep: fix wrong alignment Signed-off-by: MORITA Kazutaka commit d8a767e4a4ab73b9b40658849a155c2401436dfe Author: MORITA Kazutaka Date: Thu Aug 4 17:07:07 2011 +0900 sheep: fix uninitialize variable Signed-off-by: MORITA Kazutaka commit dbaedcedba016244a1ef22ce59c5b448303ff055 Author: MORITA Kazutaka Date: Thu Aug 4 12:30:21 2011 +0900 sheep: fix wrong free order of the request structure req->r_siblings could be linked to the list_head member in req->ci, so we cannot free req->ci before freeing req. Signed-off-by: MORITA Kazutaka commit a085af491032c7891aeec4cb26b6f28eb79882e1 Author: MORITA Kazutaka Date: Thu Aug 4 12:17:04 2011 +0900 sheep: fix wrong use of object map Only data objects are related to the object map. Signed-off-by: MORITA Kazutaka commit fa6a5acd2690c3ac49a1ebfbf6e103b1e652d87c Author: MORITA Kazutaka Date: Thu Aug 4 12:12:50 2011 +0900 sheep: limit the number of object maps The object map is used when checking the consistency of data objects. This patch limits the number of the maps and saves memory consumption.
Signed-off-by: MORITA Kazutaka commit b044bc5451044ebdd0db04d231b1c9560c8741b7 Author: MORITA Kazutaka Date: Thu Aug 4 11:38:32 2011 +0900 sheep: fix using large stack area Signed-off-by: MORITA Kazutaka commit df068bc84f01ecc438aefc5f7eb573e5443b8f50 Author: MORITA Kazutaka Date: Thu Aug 4 08:06:39 2011 +0900 sheep: allocate memory dynamically instead of statically Signed-off-by: MORITA Kazutaka commit 9ac88c3efa09fd51acce5eff7fdb0d00202b87cb Author: MORITA Kazutaka Date: Thu Jul 7 14:33:05 2011 +0900 add exits.h to the distribution archive Signed-off-by: MORITA Kazutaka commit b2781dfdac44b5f3da636a0ed790abd5897b6c2b Author: MORITA Kazutaka Date: Thu Jul 7 12:26:33 2011 +0900 sheep: fix uninitialized variable Signed-off-by: MORITA Kazutaka commit deffc0f289bcad124e9ad77152038887725358ae Author: MORITA Kazutaka Date: Tue Aug 2 17:12:57 2011 +0900 support location-aware data placement This introduces a zone to specify the location of sheep daemons. With this patch, you can do more flexible replication placement: - When storage servers have multiple disks and you run multiple sheep daemons on them, you can ensure that the data is replicated to multiple servers by specifying the same zone id to the daemons on the same server. If you don't specify the zone id, the data could be replicated to the disks of the same server. - You can replicate data to different racks by specifying the same zone id to the daemons in the same rack. To use this feature, specify a zone id in the sheep command line options. 
Example: $ sheep /store/0 -p 7000 -z 1 $ sheep /store/1 -p 7001 -z 1 $ sheep /store/2 -p 7002 -z 2 $ sheep /store/3 -p 7003 -z 2 $ sheep /store/4 -p 7004 -z 2 $ sheep /store/5 -p 7005 -z 3 $ collie cluster format -c 3 $ collie node list Idx - Host:Port Vnodes Zone ----------------------------------------- * 0 - 10.68.14.1:7000 64 1 1 - 10.68.14.1:7001 64 1 2 - 10.68.14.1:7002 64 2 3 - 10.68.14.1:7003 64 2 4 - 10.68.14.1:7004 64 2 5 - 10.68.14.1:7005 64 3 $ qemu-img convert 6g.raw sheepdog:test $ collie node info Id Size Used Use% 0 420 GB 3.1 GB 0% 1 420 GB 2.9 GB 0% 2 419 GB 2.3 GB 0% 3 419 GB 2.0 GB 0% 4 419 GB 1.7 GB 0% 5 423 GB 6.0 GB 1% Total 2.5 TB 18 GB 0%, total virtual VDI Size 6.0 GB In the above example, the same data is not replicated in the same zone, so the total used size of these zones are equal. Signed-off-by: MORITA Kazutaka commit c4231525945c2e75ff9d8fd9011d111a003639eb Author: MORITA Kazutaka Date: Sat Jul 30 01:56:11 2011 +0900 sheep: use hash value for vdi attribute object id The vdi id can change if we get the snapshot, so there is a problem that vdi attributes vanish after taking snapshots. This patch uses the hash value for the vdi attribute object id instead of the vdi id. Signed-off-by: MORITA Kazutaka commit 2f294d7c8ac88cccf47e5eac6f5540f2c1903680 Author: Chris Webb Date: Thu Jul 28 12:41:42 2011 +0100 Return EXIT_MISSING from vdi setattr and getattr when VDI is not found Previously, a general EXIT_FAILURE was returned in this case, which is hard to distinguish from other cluster failures. Signed-off-by: Chris Webb Signed-off-by: MORITA Kazutaka commit 42339ff40d1d2d98de0263c2ddab36a321f2a621 Author: MORITA Kazutaka Date: Mon Jun 20 18:45:30 2011 +0900 sheep: abort when sheep cannot get the latest epoch Because any number could be used as an epoch number, it causes a problem to return -1 when error occurs. It is a fatal error if we cannot get the latest epoch number, so we should kill the process in this case. 
Signed-off-by: MORITA Kazutaka commit 9f41951ddfd4b6e05ece26ee84ffbf8dfe75a996 Author: MORITA Kazutaka Date: Mon Jun 20 02:21:48 2011 +0900 sheep: handle network partition failure This patch kills minority nodes when a network partition has occurred. Though this approach kills many nodes, it is the most simple way to keep strong consistency. Signed-off-by: MORITA Kazutaka commit 3410623d115a177e065250e60cb1fae0e575801f Author: Chris Webb Date: Sat Jun 18 19:28:49 2011 +0100 collie: fix struct get_vid_info to struct get_vdi_info Signed-off-by: Chris Webb Signed-off-by: MORITA Kazutaka commit d4ce223e3dfdbea377b6011b3838706ad0be2987 Author: Chris Webb Date: Sat Jun 18 17:26:48 2011 +0100 collie: extend vdi list to allow listing a single VDI by name On a test cluster with three nodes and five hundred VDIs, the early exit optimisation introduced here reduces the time to find the size, used, and shared values from a specific VDI from around 3.4s (for naive grep of collie vdi list output) to around 2s, similar to a 'collie vdi object' command. However, is it possible to make this and other VDI operations properly constant- or log-time in the number of VDIs instead of linear? Signed-off-by: Chris Webb Signed-off-by: MORITA Kazutaka commit b9859febe93d26ff3decc217f4f9ac460fd61439 Author: Chris Webb Date: Sat Jun 18 17:04:05 2011 +0100 collie: add --raw option for machine-readable output This option modifies the info and list displays to make them easier to parse reliably from C and shell scripts, separating each column with a single space character. We also display times in seconds since the epoch, print sizes in raw bytes rather than using units of MB, GB, TB, etc., and escape whitespace characters and backslashes in VDI names with a leading backslash escape. 
Signed-off-by: Chris Webb Signed-off-by: MORITA Kazutaka commit 7146de6c8a52ca5afe20401c6e836dc1ef0a1e42 Author: Chris Webb Date: Thu Jun 16 17:54:38 2011 +0100 collie: use exit codes to distinguish between errors When integrating collie into cluster management scripts, it is useful to be able to tell the difference between different types of error without needing to parse human-readable error text. In addition to the standard EXIT_SUCCESS (0) and EXIT_FAILURE (1) exit conditions, we introduce EXIT_SYSFAIL (2) - something is wrong with the cluster or local host EXIT_EXISTS (3) - the object already exists so cannot be created EXIT_FULL (4) - no more space is left in the cluster EXIT_MISSING (5) - the specified object does not exist EXIT_USAGE (64) - invalid command, arguments or options and attempt to return these consistently for all collie commands. Signed-off-by: Chris Webb Signed-off-by: MORITA Kazutaka commit 786e9c7d73ad785a218c658fd14e97e9f38e1261 Author: Chris Webb Date: Thu Jun 16 12:24:13 2011 +0100 Fix NOEDLIST -> NODELIST in collie.c Signed-off-by: Chris Webb Signed-off-by: MORITA Kazutaka commit bc2cf0689be4a846cc8294e2747347f6b040a5e1 Author: Chris Webb Date: Thu Jun 16 12:12:39 2011 +0100 Fix typo in collie.c Signed-off-by: Chris Webb Signed-off-by: MORITA Kazutaka commit cc7166a2b25b514f302522de97bd1dd6332f8437 Author: MORITA Kazutaka Date: Wed Jun 15 16:37:13 2011 +0900 sheep: return all the epoch histories in the response of SD_OP_STAT_CLUSTER This info is useful for resolving node inconsistency. 
Signed-off-by: MORITA Kazutaka commit 65cc10a18ba60a91b088650e08b80c031d406861 Author: MORITA Kazutaka Date: Wed Jun 15 01:11:08 2011 +0900 collie: the number of replication is at most the number of nodes This fixes setting/getting vdi attribute bugs which occurs when the number of redundancy is larger than the number of nodes Signed-off-by: MORITA Kazutaka commit cfe7605eb24536603c0afc2cb54d93c5d72aff04 Author: Chris Webb Date: Mon Jun 13 18:15:55 2011 +0100 collie: don't read unused value from STDIN for vdi setattr -d Signed-off-by: Chris Webb Signed-off-by: MORITA Kazutaka commit be5038c1894cefc3485caf4ec523e2b0f45999c1 Author: MORITA Kazutaka Date: Mon Jun 6 18:08:56 2011 +0900 sheep: gather vdi bitmap in worker thread We cannot wait in main thread, so we must call get_vdi_bitmap_from_all() in __sd_deliver() instead of __sd_deliver_done(). Signed-off-by: MORITA Kazutaka commit 2cda774833e4bbf4754485b620eb7b28ffaa8b07 Author: Steven Dake Date: Sun Jun 5 15:10:19 2011 -0700 Update sheepdog.spec.in to match Fedora upstream rpm Signed-off-by: Steven Dake Signed-off-by: MORITA Kazutaka commit 6973afd70254565b67e6c5304f7d271a37a25072 Author: MORITA Kazutaka Date: Sat May 21 21:01:32 2011 +0900 sheepdog 0.2.3 Signed-off-by: MORITA Kazutaka commit 5d2bd3e8ac15de33d88557ea7d2052320c413bc4 Author: MORITA Kazutaka Date: Mon Apr 25 03:25:45 2011 +0900 retry getting object list when an error occurs Signed-off-by: MORITA Kazutaka commit e733d3fbcabb3473f122e1c735f3931f06b24947 Author: MORITA Kazutaka Date: Mon Apr 25 03:24:54 2011 +0900 remove failed node from cpg ring This patch handles local disk crash and file system unmount while Sheepdog is running. Signed-off-by: MORITA Kazutaka commit e830d5abfb9c429ffcac802460fe3849148711ac Author: MORITA Kazutaka Date: Mon Apr 25 03:20:20 2011 +0900 return SD_RES_NEW_NODE_VER when accessing recovering objects If the request needs checking epoch, we shouldn't add it to the waitlist but return SD_RES_NEW_NODE_VER. 
It is because the sender node could update epoch after sending requests. Signed-off-by: MORITA Kazutaka commit f985cd5ce9fe20d4226c7db30e1f349652f9c99b Author: MORITA Kazutaka Date: Mon Apr 25 03:17:45 2011 +0900 return SD_RES_NEW_NODE_VER during node membership change Sheepdog cannot process any I/O requests during the node membership change to keep strong consistency, so we must return SD_RES_NEW_NODE_VER instead of adding them to a waitlist queue. Signed-off-by: MORITA Kazutaka commit 3e4b21fa98c0d328ad41318c6a0effe702e46d77 Author: MORITA Kazutaka Date: Mon Apr 25 03:17:45 2011 +0900 fix wrong call of setup_access_to_local_objects() We can call this function only when the request header is sd_obj_req. Signed-off-by: MORITA Kazutaka commit c2dae90153b34067064e5578be7081d36c7d0dfe Author: MORITA Kazutaka Date: Mon Apr 25 03:14:24 2011 +0900 check epoch when the request causes local access Even though SD_FLAG_CMD_DIRECT is not set, local accesses could happen if the target object is in the local node. Signed-off-by: MORITA Kazutaka commit 18e082de6ce1040d560e332e06bca62e1d6140e3 Author: MORITA Kazutaka Date: Mon Apr 25 03:14:24 2011 +0900 retry when network error occurs The node membership change causes network errors, but we shouldn't notify the errors to clients. Sheepdog should be a more available system. Signed-off-by: MORITA Kazutaka commit 31f0131ffa0d6fe5bdb18365415a56f25973aea8 Author: MORITA Kazutaka Date: Mon Apr 25 03:14:24 2011 +0900 set return value in all cases Signed-off-by: MORITA Kazutaka commit b8ae389fce6508c7d6d865dd6a54f3fefe64bcb9 Author: MORITA Kazutaka Date: Mon Apr 25 03:14:24 2011 +0900 fix reading responses of forwarded write requests We cannot exit forward_write_obj_req() just when an error occurs because other connections may still be receiving data.
Signed-off-by: MORITA Kazutaka commit c1c678dce9ed6214c21327866fcba9b521611708 Author: MORITA Kazutaka Date: Mon Apr 25 03:14:24 2011 +0900 avoid calling update_cluster_info() in worker threads update_cluster_info() updates global variables, so we can call this function only in the main thread. Signed-off-by: MORITA Kazutaka commit 007e3cf5f1e728ddeed30edd9164c7990bf30c81 Author: MORITA Kazutaka Date: Mon Apr 25 03:14:24 2011 +0900 check return value of get_nth_node() Signed-off-by: MORITA Kazutaka commit db4b77169fc13cc04bdce1bd45bbcde2a95fc2c7 Author: MORITA Kazutaka Date: Mon Apr 25 03:48:38 2011 +0900 fix I/O accesses to multiple unrecovered objects If clients access unrecovered objects, Sheepdog should recover them first. This fixes a bug which occurs when the number of such objects is more than one. Signed-off-by: MORITA Kazutaka commit 36f4207ebf9aff1b8d2d27c80e5d795ec9968e0c Author: MORITA Kazutaka Date: Mon Apr 25 03:48:38 2011 +0900 introduce object recovery state This patch introduces a state of object recovery to guarantee the following: - is_recoverying_oid() returns 1 while preparing recovery - we access rw->done only in main thread Signed-off-by: MORITA Kazutaka commit b8c46eb6c91694c4b39028f6fb07c018a2d26323 Author: MORITA Kazutaka Date: Mon Apr 25 03:47:16 2011 +0900 use more suitable type We use the void* buf field of struct recovery_work only for object IDs. uint64_t* is a better type here. Signed-off-by: MORITA Kazutaka commit ce22e62d5840ac30ee1a5383c20296fd41a87c4f Author: MORITA Kazutaka Date: Sat Apr 9 03:54:10 2011 +0900 sheep: ignore SIGPIPE Signed-off-by: MORITA Kazutaka commit b6d860d00e43d86baaff4af0acadd3e93a7d286e Author: MORITA Kazutaka Date: Thu Apr 7 06:39:07 2011 +0900 send only header when error occurs The content of data is not used on error, so we can avoid sending it.
Signed-off-by: MORITA Kazutaka commit 3af29adde0329a0bdd8fcda413b1210dad590bec Author: MORITA Kazutaka Date: Thu Apr 7 06:38:18 2011 +0900 fix wrong error checks in fix_object_consistency() Signed-off-by: MORITA Kazutaka commit 1626413e7a63e63b52b0a67107d1e576ec71b90a Author: MORITA Kazutaka Date: Thu Apr 7 06:36:42 2011 +0900 avoid non-blocking in worker threads In worker threads, all I/Os are retried when EAGAIN is returned, so it is better not to set O_NONBLOCKING. Signed-off-by: MORITA Kazutaka commit 113b7e6aa3773f1cac85e50836f49d7863afd8f0 Author: MORITA Kazutaka Date: Wed Apr 6 17:08:40 2011 +0900 make socket non-blocking Signed-off-by: MORITA Kazutaka commit 6c4796cd1f1ed6a5ab99584ad6f1fd88769f538c Author: MORITA Kazutaka Date: Wed Apr 6 16:24:26 2011 +0900 move sd_nonblocking and sd_nodelay to common functions Signed-off-by: MORITA Kazutaka commit 0c26436d5a587d1df3c2838109bb2d4f1ccacd20 Author: MORITA Kazutaka Date: Thu Apr 7 08:02:28 2011 +0900 suppor direct IO O_SYNC on ext3/4 with barrier=1 and btrfs causes severe performance problems. This introduces -D option to sheep command line arguments, and enables O_DIRECT for data objects. TODO: Enables O_DIRECT for other kinds of objects (e.g. vdi objects) Signed-off-by: MORITA Kazutaka commit ca0de9d69393c6bc610569a5a290fe5fe7f6529b Author: MORITA Kazutaka Date: Wed Apr 6 13:42:23 2011 +0900 distinguish different kinds of objects Sheepdog has four kinds of objects: vdi object - contains metadata of vdi data object - contains actual data of vdi vmstate object - contains vmstate data which is used for live snapshot vdi attr object - contains vdi attributes This patch distinguishes these objects and makes codes easy to maintain. 
Signed-off-by: MORITA Kazutaka commit 23dfc04a29a85f568999800f6a397d3111699421 Author: MORITA Kazutaka Date: Wed Apr 6 00:52:45 2011 +0900 sheep: fix 32-bit integer overflow in stat_sheep() Signed-off-by: MORITA Kazutaka commit 77fe5530431d1e19cfdd97cd6250b83a4fac005f Author: MORITA Kazutaka Date: Tue Apr 5 17:48:53 2011 +0900 introduce virtual nodes Currently, Sheepdog data balancing has some problems: - When there are only a few physical nodes in the cluster, the consistent hash ring becomes sparse and data cannot be balanced well. - Even if some nodes have a larger disk space, we cannot allocate more data to the nodes. This adds preliminary support for virtual nodes; Sheepdog assigns multiple virtual nodes to each physical node, and creates a consistent hash ring with virtual nodes. The number of virtual nodes is fixed at 64 in this patch, but we can extend it in future. This patch changes the map between objects and nodes. So, we need to reformat Sheepdog cluster to try this feature, but this is a necessary change, I think. Signed-off-by: MORITA Kazutaka commit 31b6b32eb0e23c58758d975e3d97a780a0841680 Author: MORITA Kazutaka Date: Sun Apr 3 18:39:02 2011 +0900 sheep: fix double node failure recovery If the target epoch doesn't have the object on recovery, we must read from its previous epoch. Signed-off-by: MORITA Kazutaka commit 4b5d56548d622a4c5b7484a65a67c77598830d04 Author: MORITA Kazutaka Date: Sun Apr 3 18:36:19 2011 +0900 sheep: print the joining node address on error When a new node fails to join Sheepdog, its address is important information.
Signed-off-by: MORITA Kazutaka commit 204bb5d265be064c035a987f355212e44332373a Author: MORITA Kazutaka Date: Wed Mar 30 19:34:00 2011 +0900 sheep: use timerfd for the timer implementation Signed-off-by: MORITA Kazutaka commit 37bc4ee4ddfe14617ad8e774f4ca5be23ff1a52c Author: MORITA Kazutaka Date: Wed Mar 30 13:38:32 2011 +0900 sheep: use eventfd for work queue event notification Signed-off-by: MORITA Kazutaka commit 3ef643bacfc96c2cd77ca89d65d5ef8b261e24ba Author: MORITA Kazutaka Date: Mon Mar 21 14:48:47 2011 +0900 sheep: add comments about object recovery Signed-off-by: MORITA Kazutaka commit 03ab9e8e91a449ec523fe78d064107a40376ba61 Author: MORITA Kazutaka Date: Thu Mar 17 14:18:40 2011 +0900 sheep: fix the check of the return value from write_object() On error, write_object() returns a non-zero positive number. Signed-off-by: MORITA Kazutaka commit d2da348026bd09c646a1161ceb5db486b93b334c Author: MORITA Kazutaka Date: Tue Mar 15 11:39:54 2011 +0900 sheepdog 0.2.2 Signed-off-by: MORITA Kazutaka commit 66b602a7c060281584f762ce8583d7d0c36191fc Author: MORITA Kazutaka Date: Wed Mar 9 04:15:30 2011 +0900 update copyright year to 2011 Signed-off-by: MORITA Kazutaka commit d7c765a6e943e32c5288e593761455d9ba3fb600 Author: MORITA Kazutaka Date: Wed Mar 9 04:02:18 2011 +0900 sheep: fix check of the return value of read_object() Signed-off-by: MORITA Kazutaka commit f3aed1b88419077916a6a57ee4b07ff335b53ee7 Author: MORITA Kazutaka Date: Thu Feb 17 02:27:09 2011 +0900 sheep: remove cow_oid from the busy object id list A dead-lock happens in the following situation: 1. There are two nodes ('A' and 'B') in Sheepdog. 2. A CoW request (oid = 200, cow_oid = 100) is sent to the node 'A'. 3. The object '100' is stored in the node 'A', and the object '200' will be stored in the node 'B' 4. To serialize accesses, the object '100' becomes a busy object in the node 'A'. 5. The node 'A' forwards the CoW request to the node 'B'. 6. 
The node 'B' reads the object '100' from the node 'A', but the object is busy (dead-lock). In fact, the source object of CoW must be read-only, so we don't need to serialize the accesses to it. This patch removes the object from the busy object list. Signed-off-by: MORITA Kazutaka commit d281abdc7cd68023ebd849ec8bc9457865b3eefc Author: MORITA Kazutaka Date: Wed Feb 16 01:55:21 2011 +0900 add vdi attributes support It is useful to store metadata associated with virtual disks. This patch adds support for vdi attributes. Usage: $ collie vdi getattr VDINAME KEY # get value $ collie vdi setattr VDINAME KEY [-x] VALUE # set value $ collie vdi setattr VDINAME KEY [-x] < VALUEFILE # set value from stdin $ collie vdi setattr VDINAME KEY -d # delete attribute -x is an exclusive option; if you set the option and the key already exists, the operation fails. Signed-off-by: MORITA Kazutaka commit f10470a930349895e4da596669fea1c8f7f41524 Author: MORITA Kazutaka Date: Wed Feb 16 01:53:00 2011 +0900 set the number of copies in the response of vdi operations We need to know the number of replications to read/write objects directly. This patch sets the number in the response of vdi operations, and avoid redundant accesses to the vdi objects. Signed-off-by: MORITA Kazutaka commit 17e5240fb69dcf914df65b24eea95206e3e6530c Author: MORITA Kazutaka Date: Wed Feb 9 20:23:07 2011 +0900 sheep: set error reason in read_object() Signed-off-by: MORITA Kazutaka commit 6eb04141affa0d6af1725080a89d8ce84af138f6 Author: MORITA Kazutaka Date: Sun Feb 6 00:56:40 2011 +0900 sheep: avoid reading entire vdi object The size of a vdi object is large (4 MB), so we should read/write the object carefully not to reduce the performance. 
Signed-off-by: MORITA Kazutaka commit 2a5f3be526913e10c4b89238f9749dd493b17ba5 Author: MORITA Kazutaka Date: Sun Feb 6 00:05:52 2011 +0900 sheep: remove a redundant fix of consistency in read_object() read_object() should access only read-only objects, so it is not needed to fix consistency. Signed-off-by: MORITA Kazutaka commit da3fdf129f4610b98cfed4f6656f228673939a0d Author: MORITA Kazutaka Date: Sat Feb 5 21:29:27 2011 +0900 support deleting a snapshot vdi with a tag name This enables us to specify a tag name in snapshot deletion. $ qemu-img snapshot -c tag sheepdog:vdi # create snapshot $ collie vdi delete vdi -s tag # delete snapshot with tag Deleting a snapshot with a snapshot id is also still supported. Signed-off-by: MORITA Kazutaka commit 3fad7de649e4cdda3b9a7741dc8ff2817039e661 Author: MORITA Kazutaka Date: Sat Feb 5 03:21:17 2011 +0900 sheep: return the error reason in start_deletion() The function could fail because of several reasons, so it is better to return the error reason here. Signed-off-by: MORITA Kazutaka commit c6b7ca9dcbe1d6422a8cc4586521c8348d3e01bc Author: MORITA Kazutaka Date: Sat Feb 5 03:08:17 2011 +0900 sheep: pass a correct parameter to read_object() The value should be the number of copies, not the number of nodes. Signed-off-by: MORITA Kazutaka commit e0721b53c8b5fe1167410788dd7cc74048348530 Author: MORITA Kazutaka Date: Fri Feb 4 21:53:33 2011 +0900 sheepdog 0.2.1 Signed-off-by: MORITA Kazutaka commit 32ae69d898721887253868ebe3d78b6f927ea6f9 Author: MORITA Kazutaka Date: Thu Feb 3 02:08:25 2011 +0900 sheep: avoid overwriting a request header in forward_*_obj_req() We cannot overwrite the request header in forward_*_obj_req() because its fields are referred to in __done(). Signed-off-by: MORITA Kazutaka commit b9af7e81e8c53c6d7e96a6638a36583c4372b203 Author: MORITA Kazutaka Date: Wed Feb 2 01:22:01 2011 +0900 collie: avoid reading entire vdi object This improves performance of some collie commands significantly.
Signed-off-by: MORITA Kazutaka commit 58c88203d27725eb7d8c4413cb0f10f28da3b8a0 Author: MORITA Kazutaka Date: Tue Feb 1 02:29:10 2011 +0900 sheep: send read requests directly in copy-on-write The target objects could be read from multiple nodes, so it is not safe to use non-direct access here. Signed-off-by: MORITA Kazutaka commit 4856cb770c3f8a4d9625d6f1f262775e365fbe6f Author: MORITA Kazutaka Date: Mon Jan 31 21:39:03 2011 +0900 collie: read objects directly in parse_objs() When we send read requests without a SD_FLAG_CMD_DIRECT flag, Sheepdog checks data consistency of the target objects before forwarding them. However, fixing consistency could cause updating, so it is not safe for collie to send non-direct read requests because running VMs could update the objects at the same time. This patch calculates the target nodes in parse_objs(), and send read requests directly to them. This also improves performance of vdi listing because we can avoid a redundant consistency check. Signed-off-by: MORITA Kazutaka commit 18e8c283aa9c338e759aebcc814576d111999342 Author: MORITA Kazutaka Date: Thu Jan 27 19:55:01 2011 +0900 sheep: fix vdi deletion We shouldn't call read_object()/write_object() when CPG_EVENT_WORK_RUNNING is not set. Signed-off-by: MORITA Kazutaka commit cb4ed6e3ca922b1647f776f29987b3720ae1511f Author: MORITA Kazutaka Date: Thu Jan 27 16:34:36 2011 +0900 sheep: access local objects directly This patch fixes a dead lock which happens when sending vdi creation/deletion requests continuously. Signed-off-by: MORITA Kazutaka commit e5bccb301f57d089c44275585f3de5a79487c041 Author: MORITA Kazutaka Date: Thu Jan 27 16:33:58 2011 +0900 sheep: fix consistency recovery The content of hdr could be overwritten after sending a read request, so we need to set its fields again. 
Signed-off-by: MORITA Kazutaka commit cfafa6e078aa7494b80b12b2666867a7115ff670 Author: MORITA Kazutaka Date: Thu Jan 27 16:33:27 2011 +0900 sheep: remove redundant requests The number of replication is at most the number of nodes. Signed-off-by: MORITA Kazutaka commit bed7084374d4c111c75a3a40339be8ca0b4d1eee Author: MORITA Kazutaka Date: Thu Jan 27 16:32:50 2011 +0900 sheep: avoid using reqs whose connection is closed This patch fixes a memory violation which occurs when a connection is suddenly closed. Signed-off-by: MORITA Kazutaka commit 21767cc74bbf99bce9830b9bad605cca825351eb Author: MORITA Kazutaka Date: Thu Jan 13 16:43:18 2011 +0900 sheep: fix a fd leak in jrnl_perform() Signed-off-by: MORITA Kazutaka commit 411924474d8ea4307dadcf4e23e483592db614b0 Author: MORITA Kazutaka Date: Mon Jan 3 16:12:38 2011 +0900 sheepdog 0.2.0 Signed-off-by: MORITA Kazutaka commit 8ebe0c896263f3e1f887855fe4488bf9ea0d3625 Author: MORITA Kazutaka Date: Sun Jan 2 21:13:34 2011 +0900 sheep: don't remove vdi objects If vdi objects are deleted from the vdi bitmap, sheepdog could fail to look up vdis. So it is safe to keep them unremoved.
Signed-off-by: MORITA Kazutaka commit 1063b1a99b735f5c39002c3ab83e2ec788c46d13 Author: MORITA Kazutaka Date: Sun Jan 2 20:40:01 2011 +0900 update bash_completion_collie - support completion of the vdi name - remove "vm list" commands - remove "vdi lock" commands - remove "vdi release" commands Signed-off-by: MORITA Kazutaka commit 1f721d07cae3a6b8dde712eeca9f40186816f6b9 Author: MORITA Kazutaka Date: Sun Jan 2 07:42:46 2011 +0900 sheep: use PRIx64 instead of "lx" in the printf() format Signed-off-by: MORITA Kazutaka commit 519536ba533ca363d414e5e9fbd5c2b1cf75a4d6 Author: MORITA Kazutaka Date: Sun Jan 2 07:38:23 2011 +0900 sheep: check return value of write() in __start_recovery() Signed-off-by: MORITA Kazutaka commit 8389b80041083faa55065be1d3121c5f6b6e7214 Author: MORITA Kazutaka Date: Sun Jan 2 06:10:35 2011 +0900 collie: fix wrong escape Signed-off-by: MORITA Kazutaka commit be6231a7f79d74a65a060f860bb9db6296818d7e Author: MORITA Kazutaka Date: Sun Jan 2 05:19:56 2011 +0900 sheep: remove unused function cpg_context_set() Signed-off-by: MORITA Kazutaka commit 518a2d9a496caf60ab2ae30773baa75d85f5a3c8 Author: MORITA Kazutaka Date: Sun Jan 2 04:31:09 2011 +0900 sheep: handle SD_OP_STAT_CLUSTER even when the node failed to join The results of SD_OP_STAT_CLUSTER can tell us good information about why this node failed to join the sheepdog cluster, so the request should be handled even when the cluster status is SD_STATUS_JOIN_FAILED. 
Signed-off-by: MORITA Kazutaka commit a7ee82837b6cab13142b4cdbb2c1b7d6bde6ec0b Author: MORITA Kazutaka Date: Sun Jan 2 04:10:53 2011 +0900 sheep: clear a vdi bitmap when the cluster is formatted Signed-off-by: MORITA Kazutaka commit 178b4cb1d67e555769f1046f1b63b7f89638bfea Author: MORITA Kazutaka Date: Sun Jan 2 04:07:49 2011 +0900 sheep: remove a journal directory in remove_epoch() Signed-off-by: MORITA Kazutaka commit 77456c9c1f14aeb1866559dfd826d1448ff77cd0 Author: MORITA Kazutaka Date: Sun Jan 2 03:54:55 2011 +0900 Revert "sheep: fix vdi deletion" To search an unused bit correctly, we cannot clear a vdi bit after deleting VDIs. This partially reverts commit c046e740ea. Signed-off-by: MORITA Kazutaka commit 5244d6cdfaadc8759a5236360a7e3f6b3780d9d3 Author: MORITA Kazutaka Date: Sun Jan 2 02:40:23 2011 +0900 sheep: write the previous epoch log only when epoch is incremented Signed-off-by: MORITA Kazutaka commit 5edd4c6cf39f380eb2ac921661b66e304f407f7f Author: MORITA Kazutaka Date: Sun Jan 2 01:29:13 2011 +0900 sheep: shutdown after processing outstanding requests Signed-off-by: MORITA Kazutaka commit 386aeed63a540dc4b743a45b461b7a006b57f0f1 Author: MORITA Kazutaka Date: Sun Jan 2 00:20:04 2011 +0900 collie: fix options of the vdi object command Usage of the vdi object command: $ collie vdi object [-s snapshot_id] [-i index] If you don't specify the -i option, this command shows info about the vdi object. Signed-off-by: MORITA Kazutaka commit 757e67882a0bc1f88df2b69bc6c5c81933f9b03b Author: MORITA Kazutaka Date: Sat Jan 1 23:38:50 2011 +0900 collie: fix snapshot delete option This patch fixes a regression of a1bf95500c. 
You can delete the snapshot vdi with the following command: $ collie vdi delete vdiname [-s snapshot_id] Signed-off-by: MORITA Kazutaka commit a21aea9997f543b0407b4105a8770b1e29313f4b Author: MORITA Kazutaka Date: Sat Jan 1 18:22:26 2011 +0900 sheep: add missing write() in the copy-on-write operation Signed-off-by: MORITA Kazutaka commit 77444e0aaf9681ed70f419171b0f961f95da687f Author: MORITA Kazutaka Date: Fri Dec 31 03:10:18 2010 +0900 sheep: fix I/O blocking problem during object recovery Currently, Sheepdog blocks I/O operations when the target objects are not recovered yet. This patch recovers such objects first, and reduces the time of blocking. Signed-off-by: MORITA Kazutaka commit f8d7893cc8ca484a8bca0f1d325321bf8ebe71cc Author: MORITA Kazutaka Date: Tue Dec 28 18:38:46 2010 +0900 sheep: recover objects atomically Node failure could happen during object recovery, so it should be done atomically. This patch supports atomic object recovery with rename(2). Signed-off-by: MORITA Kazutaka commit 5081a39f07cba3b0d2c78713d5b5a15604c7a898 Author: MORITA Kazutaka Date: Mon Dec 27 02:43:22 2010 +0900 collie: remove vdi lock feature This patch removes the following operations: collie vm list # list all vdis with lock information collie vdi lock # lock vdi collie vdi release # unlock vdi These functionalities should be provided outside the storage system. Signed-off-by: MORITA Kazutaka commit fe14318e31d8d06a62d34fa135a9c094c46cd2e5 Author: MORITA Kazutaka Date: Mon Dec 27 02:39:39 2010 +0900 sheep: remove vdi lock feature The vdi lock feature causes the following problems: - To support the lock feature, all sheepdog nodes have a list of locked images in memory. When nodes are newly joined to the cluster, Sheepdog sends the list to them with a corosync multicast. However, the size of the list can be large when we open many images, and in that case, we cannot send the list with one mcast message because of the restriction of corosync. 
Currently, sheepdog sends the list with multiple mcast messages, but it makes the codes hard to read. - When doing a live migration, qemu needs to open its image on source host and destination host at the same time, but the locking feature prevents it. - When qemu crashes, sheepdog needs to detect it and release the lock. However it is difficult to detect the aliveness of VMs strictly if they run outside the cluster. This patch removes the lock feature and solves the above problems. Signed-off-by: MORITA Kazutaka commit 0c1e237302b214bcc58e9c2a2754a86285e94d21 Author: Narendra Prasad Madanapalli Date: Mon Dec 27 17:03:29 2010 +0900 Journal support for atomic operations This patch adds the feature of atomicity while performing vdi data object update/write operation. With the help of the journalling API, implemented the task of updating vdi object atomically in store_queue_request_local() for the operations SD_OP_WRITE_OBJ & SD_OP_CREATE_AND_WRITE_OBJ. Signed-off-by: Narendra Signed-off-by: MORITA Kazutaka commit 423f21c0f3e243344c56cbbf33052a32b3da4248 Author: MORITA Kazutaka Date: Sun Dec 26 01:32:51 2010 +0900 sheep: fix data consistency when reading objects for the first time If total node failure happens, data consistency of replicated objects could be broken. This patch overwrites replicated objects with the same data and recovers the data consistency when qemu reads the objects for the first time. Signed-off-by: MORITA Kazutaka commit 5bc71fa81f98cebea3911e24a80599331188917b Author: MORITA Kazutaka Date: Sun Dec 26 01:51:48 2010 +0900 sheep: remove verify_object() verify_object() was used for /[stored dir]/obj/[epoch]/list, which contains the list of object IDs. This patch replaces the function to the following simpler procedures: 1. create "list.tmp" 2. write data to list.tmp 3. rename "list.tmp" to "list" rename(2) is an atomic operation, so we can create the file in the all or nothing way; we don't need verification of it. 
Signed-off-by: MORITA Kazutaka commit e8807dc758ae7aeacfb54b76c60b75101d47c525 Author: MORITA Kazutaka Date: Wed Oct 27 14:08:41 2010 +0900 sheep: call free_request() after decrementing reference counters We cannot call free_req() here because client_decref() accesses req->ci. Signed-off-by: MORITA Kazutaka commit 211d2f1f9bfe90cdd0f3cb9516576cb34b01a445 Author: MORITA Kazutaka Date: Mon Oct 25 13:41:09 2010 +0900 sheep: cache socket discriptors This patch reuses socket descriptors when accessing data objects, and improves latency. Signed-off-by: MORITA Kazutaka commit 72b3c7fdc975cc541cf42cd7339e06c7a392c2f8 Author: MORITA Kazutaka Date: Mon Oct 25 13:17:31 2010 +0900 sheep: use O_SYNC flag for opening objects A sheepdog qemu block driver doesn't implement bdrv_flush(), so sheepdog does not support writeback semantics now. In this case, opening object with O_SYNC and silently upgrading to writethrough semantics is safe. Signed-off-by: MORITA Kazutaka commit 452bc6a4f51ba96abd907ff9bf26cfbec0a89cb8 Author: MORITA Kazutaka Date: Tue Oct 19 17:28:46 2010 +0900 sheep: avoid calling vdi_op_done() in worker threads We cannot call vdi_op_done() in worker thread, so this patch moves it to cpg_event_done(). Signed-off-by: MORITA Kazutaka commit 80c3b08e377bc457e2a534e2cfb682766054e9bf Author: MORITA Kazutaka Date: Fri Oct 15 21:26:14 2010 +0900 sheep: call start_recovery when cluster restarts with one node Sheepdog recovers objects before starting a storage service, and the routine is called when nodes are joined. However, if sheepdog consists of only one node, no node sends join messages, so start_recovery isn't called. This patch fixes the problem. Signed-off-by: MORITA Kazutaka commit c6fe02039703b8d15bd29eb92cdc2bbfa8704fed Author: MORITA Kazutaka Date: Thu Sep 23 15:46:10 2010 +0900 add support for GNU Flymake This enables an on-the-fly syntax checker for Emacs.
Signed-off-by: MORITA Kazutaka commit 563862cf12da1ff4e9d2cfa2605edfbe842ff687 Author: Steven Dake Date: Fri Sep 17 10:01:02 2010 -0700 Place INSTALL directions in INSTALL file It is customary to have a separate INSTALL file which explains how to build, compile, and install software from source (or upstream pre-built repos). This patch separates those instructions into separate files. Signed-off-by: Steven Dake Signed-off-by: MORITA Kazutaka commit b69258b41fb15a18b06fe5d93ac15341c96de860 Author: Narendra Prasad Madanapalli Date: Wed Sep 15 06:23:37 2010 +0900 Enable IPv6 support for addr_to_str() Enable IPv6 support in the function addr_to_str() and modify print_node_list_entry() to make use of addr_to_str(). Signed-off-by: Narendra Signed-off-by: MORITA Kazutaka commit a7291e17c75e2f7247e0b9be7965cc0da7e08edc Author: Steven Dake Date: Fri Sep 3 10:10:31 2010 -0700 Change shutdown priority from 21 to 79 The previous shutdown priority was incorrect, resulting in corosync shutting down before sheepdog shuts down on system shutdown. Signed-off-by: Steven Dake Signed-off-by: MORITA Kazutaka commit 5a72f8b39a9b97c04e4f87566b91eb4b0f31218d Author: MORITA Kazutaka Date: Fri Sep 3 16:49:51 2010 +0900 sheep: add -lpthread to linker options This patch fixes compile errors on some distributions. Signed-off-by: MORITA Kazutaka commit 90d31b24fa5bc1a4c6d617dedb6f2212de3c0025 Author: MORITA Kazutaka Date: Fri Sep 3 14:05:12 2010 +0900 collie: set SD_FLAG_CMD_DIRECT flag to show object information This patch fixes the problem that 'collie vdi object' shows wrong information. Signed-off-by: MORITA Kazutaka commit ea661927c7c4ebc9925d8793732acea747ca1300 Author: MORITA Kazutaka Date: Sun Aug 29 04:22:54 2010 +0900 sheep: make recovery requests success in any cases I/O operations to recover objects can success always because they are read operations against the immutable objects (in the past epoch). 
This patch makes them success in any cases, and avoid a dead lock between regular I/O operations and recovery I/O operations. Signed-off-by: MORITA Kazutaka commit 933871ca990e6e606901ef80b36ef1ea4102f791 Author: MORITA Kazutaka Date: Sun Aug 29 04:10:34 2010 +0900 sheep: fix epoch incrementation In join operations, the condition to increment epoch is whether sheepdog cluster is already running or not. In this patch, the master node notifies the information to all the node in the response of join messages. Signed-off-by: MORITA Kazutaka commit 488b7a8bf776d02967e92479a2a9ce657d8005bb Author: MORITA Kazutaka Date: Sun Aug 29 03:51:15 2010 +0900 sheep: fix retry conditions of forwarding operations We must take into account the following cases: - the target node is down and epoch will change soon - the target node does not finish join operations Signed-off-by: MORITA Kazutaka commit e2db37559f1c1cb47f92097cd71e2ceaba4c4b66 Author: MORITA Kazutaka Date: Sun Aug 29 03:38:44 2010 +0900 sheep: unset SD_FLAG_CMD_DIRECT before returning from forward functions hdr->flags will be referred later to check whether the request should be retried or not, so we must unset SD_FLAG_CMD_DIRECT before exiting these functions. Signed-off-by: MORITA Kazutaka commit c2155c8edd04759855b4767d4b9b22d12060724e Author: MORITA Kazutaka Date: Sun Aug 29 03:20:16 2010 +0900 sheep: avoid processing I/O until the target object is recovered I/O operations fail always if the target objects are not recovered from the previous node membership change, so such operations should be delayed until the object is recovered. This patch sorts the order of recovering objects and makes it easy to check whether the object is recovered or not. 
Signed-off-by: MORITA Kazutaka commit 2e1ca38c06d20ffdf70b6df768d517a2123bd804 Author: MORITA Kazutaka Date: Sun Aug 29 03:08:58 2010 +0900 sheep: fix socket descriptor leak Connections could be closed before sending a response, so we need to call client_decref() before watching socket descriptors. Signed-off-by: MORITA Kazutaka commit e2e90e22c62b139e3b28ee641384ab27348d2b2a Author: MORITA Kazutaka Date: Sun Aug 29 03:00:49 2010 +0900 sheep: set the number of replication properly Data redundancy cannot be larger than the number of nodes. Signed-off-by: MORITA Kazutaka commit bac4d2110d55c47c7b0524b8f7a24410281108b2 Author: MORITA Kazutaka Date: Sun Aug 29 02:49:43 2010 +0900 sheep: avoid manipulating a node list in worker threads We cannot call add_node/del_node in worker threads. This patch moves these routines from __sd_confchg (worker threads) to __sd_confchg_done (main process). Signed-off-by: MORITA Kazutaka commit 89e292b37e5cb86333409d529179f5ec4fbf447b Author: Steven Dake Date: Wed Aug 25 00:17:16 2010 -0700 Add sheep.8 man page and man page infrastructure to build system/rpm gen This patch adds a sheep.8 man page based upon the README instructions and reading of the sheep source code for command line options. While this man page isn't perfect, certainly a good start and better than nothing. Signed-off-by: Steven Dake Signed-off-by: MORITA Kazutaka commit 4d4d2a928fbc098ab1cf6afdd01bcad540afe7b4 Author: Steven Dake Date: Mon Aug 23 15:30:41 2010 -0700 Add LSB style init script to build system This patch adds an LSB style init script to the build system. It also installs it within the created RPM file. Signed-off-by: Steven Dake Signed-off-by: MORITA Kazutaka commit 50443fe797a53164fd45ccea82983cf58ad1fdd9 Author: Steven Dake Date: Fri Aug 20 11:38:02 2010 -0700 Add sheepdog RPM generation to autotools system This patch adds the logic necessary to generate an RPM file within the sheepdog working dir by running the command "make rpm".
Signed-off-by: Steven Dake Signed-off-by: MORITA Kazutaka commit c92d3d8a5aa70177a3020f0c6d953b8a73952fe8 Author: MORITA Kazutaka Date: Fri Aug 6 01:53:29 2010 +0900 sheepdog 0.1.0 Signed-off-by: MORITA Kazutaka commit c046e740ea7c84d8670c348b53305a0f931e74d0 Author: MORITA Kazutaka Date: Fri Jul 23 23:42:15 2010 +0900 sheep: fix vdi deletion We need to clear a bit from a vdi bitmap after deleting VDIs. Signed-off-by: MORITA Kazutaka commit aeb233f5a83da6457617ec0706cb3895defd36cf Author: MORITA Kazutaka Date: Fri Jul 9 12:14:15 2010 +0900 add a COPYING file Signed-off-by: MORITA Kazutaka commit fa3f041dc9f8ab729a015edb6fb81eaa90e632c2 Author: MORITA Kazutaka Date: Wed Jul 7 16:57:40 2010 +0900 update README The Sheepdog client is included into the qemu mainline, so use the mainline tree from now on. Signed-off-by: MORITA Kazutaka commit ab19300721ef61adedb2d215aabd744441a195bd Author: MORITA Kazutaka Date: Tue Jul 6 20:08:29 2010 +0900 sheep: increase the number of threads The current implementation limits the number of VMs on the same host machines to the number of worker threads; we can run only 4 VMs for each host machines. This patch increases the number of threads (and the number of VMs) from 4 to 64. I think this fix is temporary and we need to remove the limitation about the number of VMs in future. Signed-off-by: MORITA Kazutaka commit b5a1a193a5210aa95ab4cb2f0ea72d6ffc05d263 Author: MORITA Kazutaka Date: Wed Jul 7 16:45:48 2010 +0900 remove zero_block We don't use the zero_block buffer in most cases, so we should allocate the buffer directly when it is required. 
Signed-off-by: MORITA Kazutaka commit 3d49b7bd31d3ae42500ceb191d3827f1e8948c10 Author: MORITA Kazutaka Date: Tue Jul 6 16:41:33 2010 +0900 retry epoll_wait when errno is EINTR Signed-off-by: MORITA Kazutaka commit 68cc3124884f207d2e1d873e296984d842465768 Author: MORITA Kazutaka Date: Tue Jul 6 16:25:21 2010 +0900 count the number of requests to free a client_info safely When a connection is closed, we cannot free a client_info if server is processing a request which references the client_info. This patch introduces a reference count to check whether we can free the client_info safely. Signed-off-by: MORITA Kazutaka commit 9a2c7d7c83914a1ba6d8ba016be5f752e7deeefd Author: MORITA Kazutaka Date: Fri Jun 18 21:21:06 2010 +0900 sheep: add snapshot tag support This patch enables us to add a snapshot name when creating a snapshot. This is mutually depends on the client patch I'll send later to the qemu-devel. You can also get these from the latest git tree: server: git://sheepdog.git.sourceforge.net/gitroot/sheepdog/sheepdog next client: git://sheepdog.git.sourceforge.net/gitroot/sheepdog/qemu for-block usage: $ qemu-img snapshot -c tagname sheepdog:linux $ ./collie/collie vdi list name id size used shared creation time vdi id ------------------------------------------------------------------ s linux 1 2.0 GB 48 MB 0.0 MB 2010-06-18 20:40 a5d05d linux 2 2.0 GB 0.0 MB 48 MB 2010-06-18 21:23 a5d05e $ qemu-img snapshot -l sheepdog:linux Snapshot list: ID TAG VM SIZE DATE VM CLOCK 1 tagname 0 2010-06-18 21:23:32 00:00:00.000 Signed-off-by: MORITA Kazutaka commit 905f95961c9608d1e7eda9af8a7e7175a769c2df Author: MORITA Kazutaka Date: Thu Jun 17 15:47:20 2010 +0900 collie: add graph view option This patch adds a graph view option to the `collie vdi' command. This is the same as `shepherd info -f graph' in the old syntax. The output text is parsable by Graphviz (graph drawing tools developed by AT&T Research Labs) and help us debugging VDI relation bugs. 
Example: $ qemu-img convert src.raw sheepdog:linux # create volume $ qemu-img snapshot -c name sheepdog:linux # create snapshot $ qemu-system-x86_64 sheepdog:linux:1 # boot from snapshot $ collie vdi graph # show graph digraph G { node [shape = "box", fontname = "Courier"]; "0" [shape = "ellipse", label = "root"]; "0" -> "a5d05d"; "a5d05d" [ group = "linux", label = "name: linux\ntag : 1\nsize: 20 MB\ndate: 2010-06-17\ntime: 15:12:09" ]; "a5d05d" -> "a5d05e"; "a5d05e" [ group = "linux", label = "name: linux\ntag : 2\nsize: 20 MB\ndate: 2010-06-17\ntime: 15:12:35" ]; "a5d05d" -> "a5d060"; "a5d060" [ group = "linux", label = "name: linux\ntag : 3\nsize: 20 MB\ndate: 2010-06-17\ntime: 15:12:53" color="red" ]; } $ collie vdi graph | dotty - # show graph with graphviz Signed-off-by: MORITA Kazutaka commit c744068d91480a41363954249e5a9964f78d8e4c Author: MORITA Kazutaka Date: Thu May 27 15:55:56 2010 +0900 collie: add tree view option This patch adds a tree view option to the `collie vdi' command. This is the same as `shepherd info -f tree' in the old syntax. 
Example: $ qemu-img convert src.raw sheepdog:linux # create vdi $ collie vdi tree # show vdi tree linux---(You Are Here) $ qemu-img snapshot -c name sheepdog:linux # create snapshot $ qemu-img snapshot -c name sheepdog:linux # create snapshot $ collie vdi tree # show vdi tree linux---[2010-06-17 15:12]---[2010-06-17 15:12]---(You Are Here) $ qemu-system-x86_64 sheepdog:linux:1 # boot from snapshot $ collie vdi tree # show vdi tree linux---[2010-06-17 15:12]-+-[2010-06-17 15:12]---[2010-06-17 15:12] `-(You Are Here) $ qemu-img snapshot -c name sheepdog:linux # create snapshot $ collie vdi tree # show vdi tree linux---[2010-06-17 15:12]-+-[2010-06-17 15:12]---[2010-06-17 15:12] `-[2010-06-17 15:12]---(You Are Here) Signed-off-by: MORITA Kazutaka commit 9ac045539d13ae6865af6eda00785c49d838bbb0 Author: MORITA Kazutaka Date: Thu May 27 14:42:11 2010 +0900 collie: support collie running outside the cluster With option '-a', you can specify the machine to connect. Signed-off-by: MORITA Kazutaka commit 493dcc3892d58afb008cfdd4c6ba21b06930018a Author: FUJITA Tomonori Date: Mon May 24 16:42:19 2010 +0900 add .gitignore Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit 8c336663add0ae36f6cda6a825ceba96a987508c Author: Steven Dake Date: Fri May 21 10:49:25 2010 -0700 Remove #include since not necessary and introduces a curses depend. Signed-off-by: Steven Dake Signed-off-by: MORITA Kazutaka commit 96a439185b33d6793687e10f1097ff814bc28f87 Author: Steven Dake Date: Fri May 21 10:43:16 2010 -0700 Build lib files once into an archive instead of twice for sheep/collie The previous automake setup would build the lib dir files twice. This patch builds a library of files in the lib dir, and then links it with either collie or sheep. 
Signed-off-by: Steven Dake Signed-off-by: MORITA Kazutaka commit 976bb0a1c5292249b6491d4ea370aefc51409d4a Author: MORITA Kazutaka Date: Fri May 21 20:52:41 2010 +0900 update README Changes from the previous version are: - change the client tree from qemu-kvm to qemu - change the sheepdog driver from format to protocol - add `./autogen.sh' and `./configure' to build commands Signed-off-by: MORITA Kazutaka commit 85e9574b5b350b608a378af5355bc6c62eae1014 Author: MORITA Kazutaka Date: Fri May 21 20:21:33 2010 +0900 shadow problem: attr() is shadowed by its parameter Rename a parameter attr to name. Signed-off-by: MORITA Kazutaka commit 28ae1dc65189abde7dcf06deb91a54a66de7f93b Author: MORITA Kazutaka Date: Fri May 21 20:16:16 2010 +0900 define _GNU_SOURCE to use pread64() and pwrite64() Signed-off-by: MORITA Kazutaka commit 6f53229a59cdc17784586a78812439438578de30 Author: MORITA Kazutaka Date: Fri May 21 19:55:11 2010 +0900 shadow problem: w is shadowed by a local in a function Rename a function parameter w to work. Signed-off-by: MORITA Kazutaka commit f7f928966738ca7162866a351e55c11216c1eae2 Author: MORITA Kazutaka Date: Fri May 21 19:49:52 2010 +0900 shadow problem: daemon is defined in but used as a local in a function Rename variable daemon to is_daemon. 
Signed-off-by: MORITA Kazutaka commit e3130b1bdd23225310dc3dd6492455b34635487a Author: MORITA Kazutaka Date: Fri May 21 19:42:41 2010 +0900 avoid using a void pointer in arithmetic Signed-off-by: MORITA Kazutaka commit 26325ee99df11017a80a5220e7d58fad23817f23 Author: MORITA Kazutaka Date: Fri May 21 19:37:24 2010 +0900 avoid mixed declarations and code Signed-off-by: MORITA Kazutaka commit cbe548bab257426151600bcad6fac0ea2a000b4d Author: MORITA Kazutaka Date: Fri May 21 19:32:19 2010 +0900 add a format attribute to the functions which have a format parameter Signed-off-by: MORITA Kazutaka commit f2a960003941405d6f0a4df5a2459b3888188f0b Author: MORITA Kazutaka Date: Fri May 21 18:34:44 2010 +0900 use a string literal for printf formatting It is not safe to use a variable for printf formatting. Signed-off-by: MORITA Kazutaka commit 0894ea50cbe76894339fa31f9bf399f05c9b820a Author: MORITA Kazutaka Date: Fri May 21 18:26:24 2010 +0900 const correctness: change dir in main() from (char *) to (const char *) Signed-off-by: MORITA Kazutaka commit c035ee80c0119e852fceac5ffc915c673d5ccea7 Author: MORITA Kazutaka Date: Fri May 21 18:18:39 2010 +0900 const correctness: make short_options a const char * Signed-off-by: MORITA Kazutaka commit ee7a1fa2c06fe3f488b69f51cd91c111db692a38 Author: MORITA Kazutaka Date: Fri May 21 18:02:55 2010 +0900 collie: use %Y instead of %y in time formatting `%y' format causes a gcc warning Signed-off-by: MORITA Kazutaka commit c0cab2525ab61fae902732ee8fa6df1b54d2a21e Author: MORITA Kazutaka Date: Fri May 21 17:58:18 2010 +0900 const correctness: connect_to() 1st parameter is really a const char * Signed-off-by: MORITA Kazutaka commit c61efd49bd756d211d6c70a2f407a06f8be7a33d Author: MORITA Kazutaka Date: Fri May 21 17:08:14 2010 +0900 sheep: fix cluster_info update What we want to do here are: 1. When sheepdog is already running - increment epoch - write node list information to the local store 2. 
Otherwise - get vdi bitmap from other nodes - update cluster information in the local store Signed-off-by: MORITA Kazutaka commit 901fba9a0b64f37bd387e6d759dfb43064ae398e Author: Steven Dake Date: Mon May 17 13:56:42 2010 -0700 Remove unused vosts structure The vosts structure is unused and not set, resulting in a potential segfault if the eprintf code is executed. Another alternative is to set vosts[x] but I am not certain what it should be set to. Signed-off-by: Steven Dake Signed-off-by: MORITA Kazutaka commit 2fde2f815142f1a2cf566b04f5e8b4b4f837c214 Author: Steven Dake Date: Mon May 17 13:56:41 2010 -0700 exit_work_queue is unused and file scope First make exit_work_queue static so that it's file scope is honored by the compiler (it uses a file scope structure as a parameter). Second compile it out by default unless the compile flag COMPILE_UNUSED_CODE is defined. Signed-off-by: Steven Dake Signed-off-by: MORITA Kazutaka commit 0e7e6e4c34f86ee27b9645d2f2ecaafd33fcd617 Author: Steven Dake Date: Mon May 17 13:56:40 2010 -0700 use PRIu/xXX macros for store.c The C language doesn't handle typing of the print formatter very well. To solve this problem, the PRIx PRIu followed by bitsize macros are typically used. Signed-off-by: Steven Dake Signed-off-by: MORITA Kazutaka commit 6fd6398b5c2988b85292d5688bccabe93ece872d Author: Steven Dake Date: Mon May 17 13:56:39 2010 -0700 const correctness: attr() 2nd parameter is really a const char * in all cases. Signed-off-by: Steven Dake Signed-off-by: MORITA Kazutaka commit 2c805bd868854c36b922798e2af60b19011929fb Author: Steven Dake Date: Mon May 17 13:56:38 2010 -0700 use PRIuXX macros in group.c The C language has problems around typing and format printing. The usual solution to this problem is to use the PRIuXX macro set. 
Signed-off-by: Steven Dake Signed-off-by: MORITA Kazutaka commit b2228333636da3aacff24e8889f9bba570366876 Author: Steven Dake Date: Mon May 17 08:47:24 2010 -0700 Autotoolize sheepdog This patch contains the initial work to autotoolize sheepdog. To generate the configure script, run the autogen.sh script. Then run ./configure followed by make. make install also works properly. Then run configure as normal. The configure.ac supports several options including --enable-profiling and --enable-debug as well as the standard GNU automake setup. The warnings list is a pretty standard list of warning catches, but does generate a lot of warning output. One thing that is missing is using the git version field rather than PACKAGE_VERSION generated from configure.ac. Finally, if you want to generate an output tarball, run make dist. make distcheck verifies the make system included all the proper files to support a self-build. Signed-off-by: Steven Dake Signed-off-by: MORITA Kazutaka commit e4a01ece76122fdeb600165bea688bbc567781da Author: MORITA Kazutaka Date: Thu May 20 16:54:29 2010 +0900 collie: remove ncurses dependency Using ncurses library is overkill. We only want to use the bold text. Signed-off-by: MORITA Kazutaka commit 91dc962bb0f91878ed8e2627b8a93e6cb065f431 Author: MORITA Kazutaka Date: Thu May 20 16:37:21 2010 +0900 collie: turn off highlight option when stdout is not terminal Signed-off-by: MORITA Kazutaka commit 32827020c5db0a22b3b9943299465b84e9215562 Author: FUJITA Tomonori Date: Wed May 19 01:57:11 2010 +0900 use gcc built-in ffs() Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit 082ac80690d5a56a0c24b5efe74af95574d65b67 Author: Steven Dake Date: Mon May 17 09:37:17 2010 -0700 shadow problem: info shadowed inside function defining shadow as local Rename info to info_old for the scope it is accessed in.
Signed-off-by: Steven Dake Signed-off-by: MORITA Kazutaka commit 65f9f5f2bb36661024e4762fd80ebe76ed7cda24 Author: Steven Dake Date: Mon May 17 09:37:15 2010 -0700 shadow problem: index is defined in but used as a local in a function Rename variable index to idx. Signed-off-by: Steven Dake Signed-off-by: MORITA Kazutaka commit b77afa9bf46c6e6a394c9e718437a658cd6c0a47 Author: Steven Dake Date: Mon May 17 09:37:14 2010 -0700 shadow problem: time() is shadowed by a local time[] rename the local variable time to time_str. Signed-off-by: Steven Dake Signed-off-by: MORITA Kazutaka commit e87f63941b409aca5d3fda607f71a69aa59518ef Author: Steven Dake Date: Mon May 17 09:37:13 2010 -0700 const correctness: retype parameters to tgetstr collie passes const char * parameters to tgetstr, but tgetstr expects non const parameters. In this case, retyping is safe. Signed-off-by: Steven Dake Signed-off-by: MORITA Kazutaka commit 8e44ab525ccd38d855ef1a61757937e004b9d43a Author: Steven Dake Date: Mon May 17 09:37:12 2010 -0700 const correctness: make short_options a const char * getopt_long expects a const char * for third argument, not a char *. Signed-off-by: Steven Dake Signed-off-by: MORITA Kazutaka commit dbdd24b795eb2ddb65582358a132bb850ddeb8bb Author: Steven Dake Date: Mon May 17 09:37:11 2010 -0700 const correctness: char units[] is really const char units[] This patch changes the constant units[] to const char. Signed-off-by: Steven Dake Signed-off-by: MORITA Kazutaka commit f35c5d5d7deb8bd15e6c16669f825ce309e7b8c9 Author: Steven Dake Date: Mon May 17 09:37:10 2010 -0700 const correctness: commands[] change char to const char * This first parameter is really a (const char *) instead of a (char *). Signed-off-by: Steven Dake Signed-off-by: MORITA Kazutaka commit 7b1ff7211fae24a936eec1255d30fb3bba26243c Author: Steven Dake Date: Mon May 17 09:37:09 2010 -0700 const correctness: subcommand struct The first subcommand struct member (char *) is really a (const char *). 
Signed-off-by: Steven Dake Signed-off-by: MORITA Kazutaka commit b1941caae8ec1e7797ca8d8290ddcd0278f14c57 Author: MORITA Kazutaka Date: Wed May 12 18:24:47 2010 +0900 collie: read entire object when listing vdi To calculate the number of allocated object, we need to read the data_vdi_id field, so we must read the entire object. Signed-off-by: MORITA Kazutaka commit f7e0c8ca363dfddaf4e2cefe0932c2f62c128d26 Author: MORITA Kazutaka Date: Wed May 12 17:02:17 2010 +0900 move some request result codes from sheep.h to sheepdog_prog.h Qemu can get these result codes. Signed-off-by: MORITA Kazutaka commit 4f5570d7ee4933eebd7de88a756bbc2a6b7ddcff Author: MORITA Kazutaka Date: Wed May 12 17:00:50 2010 +0900 collie: fix wrong syntax `switch' should be 'if` here. Signed-off-by: MORITA Kazutaka commit 8f0808bf15748ff30bcb6f33f3a6e9b8746d5da9 Author: MORITA Kazutaka Date: Wed May 12 16:52:33 2010 +0900 sheep: reject requests if the target node is not the sheepdog member Signed-off-by: MORITA Kazutaka commit 2932a13ae67b61847acf0f1c9e5cbc050cf8a2ee Author: MORITA Kazutaka Date: Wed May 12 16:09:23 2010 +0900 change program names Currently, program names in Sheepdog project are: sheepdog daemon : collie sheepdog admin tool : shepherd Collie is sheepdog, and shepherd is a job name to take care of sheep. Sheepdog daemon is running on each node, so in this case, there are many sheepdogs and one shepherd in the cluster. But, if we take into account our project name, there should be many sheeps and one sheepdog. 
This patch changes the names to as follows: sheepdog daemon : sheep sheepdog admin tool : collie Signed-off-by: MORITA Kazutaka commit 2e2f91fe01ba81e5789f1ab10bd1742b4f6ef90f Author: MORITA Kazutaka Date: Wed May 12 14:55:03 2010 +0900 collie: fix IO request retry Signed-off-by: MORITA Kazutaka commit 50979b1e6025d40bb3183d0150d30904dfc93ae9 Author: MORITA Kazutaka Date: Wed May 12 14:52:29 2010 +0900 collie: fix condition to start recovery Signed-off-by: MORITA Kazutaka commit 0cb71d2fd830eb60e2fee442d4851904de020add Author: MORITA Kazutaka Date: Tue May 11 20:37:13 2010 +0900 collie: setup cluster_info correctly when collie restarts Signed-off-by: MORITA Kazutaka commit d422c8b7b0c5a34c82ca0218c627f812218784bb Author: MORITA Kazutaka Date: Tue May 11 19:58:32 2010 +0900 collie: fix recovery retry bug When there are pending recovery works, we must stop the current recovery work and go to the next one. Signed-off-by: MORITA Kazutaka commit d207df00a2506f25f581b1a3dd16235394307351 Author: MORITA Kazutaka Date: Tue May 11 19:26:51 2010 +0900 collie: receive cluster creation time when node joins Signed-off-by: MORITA Kazutaka commit d16c1da241d8f0e200c01a5c30f34c3f2ee8d89b Author: MORITA Kazutaka Date: Tue May 11 18:56:51 2010 +0900 collie: fix object recovery when adding nodes This fixes the calculation of the target node to recovery objects. Signed-off-by: MORITA Kazutaka commit bdb361fc8090a181a39077b755f444f1aae93b69 Author: MORITA Kazutaka Date: Tue May 11 17:22:03 2010 +0900 collie: properly set a epoch number This fixes the problem that newly joined nodes set a wrong epoch. 
Signed-off-by: MORITA Kazutaka commit 28be3a87c8b8ab854be1ca55853c90b91beea33d Author: MORITA Kazutaka Date: Tue May 11 17:16:10 2010 +0900 collie: fix double call of join() Signed-off-by: MORITA Kazutaka commit 511be8bf54506fd4e5f9840e56a20265a3d45e5e Author: MORITA Kazutaka Date: Tue May 11 15:29:57 2010 +0900 collie: fix object list creation bug When the number of nodes is less than the number of replication, we cannot calculate the hash value based on sys->nr_sobjs. Signed-off-by: MORITA Kazutaka commit 1b6329123b5180580408f093dd886dee26b9d890 Author: MORITA Kazutaka Date: Tue May 11 14:08:38 2010 +0900 shepherd: update node list after parsing arguments Shepherd should get node list from the specified daemon port, so we cannot call update_node_list before parsing arguments. Signed-off-by: MORITA Kazutaka commit 27c9e7756cd71467d4cd5085c51cae3042f94bf3 Author: MORITA Kazutaka Date: Mon May 10 21:21:25 2010 +0900 collie: fix resume_pending_requests Signed-off-by: MORITA Kazutaka commit d2e42f2faa4c6112f6cdfd30beea035155519dbf Author: FUJITA Tomonori Date: Mon May 10 20:22:10 2010 +0900 add SD_DEFAULT_REDUNDANCY set SD_DEFAULT_REDUNDANCY to 3. 
Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit fa4a0a50dd0b4f81918ea30e2ac3cb1233a82c33 Author: MORITA Kazutaka Date: Mon May 10 20:40:50 2010 +0900 collie: set the protocol version to the header Signed-off-by: MORITA Kazutaka commit bc9f0d5337d17d2ac72f7f182765ff0d3ba406a0 Author: MORITA Kazutaka Date: Mon May 10 20:31:02 2010 +0900 add helper function to return string describing error number Signed-off-by: MORITA Kazutaka commit 2200fba26049981241bd2571e40db762f6122bfe Author: MORITA Kazutaka Date: Mon May 10 19:44:44 2010 +0900 collie: remove unused operation SD_OP_SYNC_OBJ Signed-off-by: MORITA Kazutaka commit 2b0538be138f5983e8f127f881cf14cd8a3d77e3 Author: MORITA Kazutaka Date: Mon May 10 19:43:09 2010 +0900 remove unused status definition Signed-off-by: MORITA Kazutaka commit 2bc21697543fda85ffc132a94f583b8d9a7354cf Author: MORITA Kazutaka Date: Mon May 10 19:28:04 2010 +0900 shepherd: add header to the vdi list Signed-off-by: MORITA Kazutaka commit 466280531c52f902868496bc5932dcd3d218182d Author: MORITA Kazutaka Date: Mon May 10 19:23:12 2010 +0900 shepherd: improve vdi listing performance This patch does the following things: - reuse the socket descriptor during listing vdis - read only the part of the vdi objects Signed-off-by: MORITA Kazutaka commit a0c96eefa52cd20a14b37c36699993167fcf61eb Author: MORITA Kazutaka Date: Mon May 10 15:52:08 2010 +0900 collie: return SD_NO_SPACE when there is no free space Signed-off-by: MORITA Kazutaka commit 8f9b6b634444ef196718e8603cb0c086ad862202 Author: FUJITA Tomonori Date: Mon May 10 11:08:10 2010 +0900 shepherd: fix commands requiring the third argument Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit 4abe3a9f61b44eb3f1c4856bfaf98b8aa02d7507 Author: MORITA Kazutaka Date: Mon May 10 07:14:22 2010 +0900 check protocol version We should avoid wrong version members coming in the cluster.
Signed-off-by: MORITA Kazutaka commit 277f4adf7e0423422c3a5f2e950fbf65e8514195 Author: MORITA Kazutaka Date: Mon May 10 06:30:09 2010 +0900 remove unused request header fields Signed-off-by: MORITA Kazutaka commit 519f3dd0b675bb4260d096b2557e4ada7a659a16 Author: MORITA Kazutaka Date: Mon May 10 06:10:19 2010 +0900 clean up header files Header files about sheepdog protocols are grouped into three. include/sheepdog_proto.h - used between server and VM include/collie.h - used in the server (including shepherd) collie/collie_priv.h - internal to collie daemons Signed-off-by: MORITA Kazutaka commit b773f83ba53e560613f0dafe8f64dc6ac1337227 Author: MORITA Kazutaka Date: Mon May 10 04:02:13 2010 +0900 collie: remove wrong free() Signed-off-by: MORITA Kazutaka commit 867d230453ca88580a08c4854563dfc084efb90e Author: FUJITA Tomonori Date: Sun May 9 09:25:39 2010 +0900 add the bash completion script for shepherd Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit a1bf95500c5be3d49bfd6e1284c500bb95d82f58 Author: FUJITA Tomonori Date: Sun May 9 03:23:47 2010 +0900 shepherd: refine command syntax fujita@rose:~/git/sheepdog$ ./shepherd/shepherd --help Usage: shepherd command subcommand [options] Sheepdog Administrator Utilty Command syntax: cluster (info|format|shutdown) node (info|list) vdi (list|delete|object|lock|release) vm list Common parameters: -p, --port specify the daemon port -h, --help display this help and exit Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit edbf6c09f284cb36ff389559131de920f255ea75 Author: MORITA Kazutaka Date: Sat May 8 11:01:16 2010 +0900 collie: remove atomic vdi object update support We need to update vdi objects atomically to have tolerance against total node failure. However, adding checksum for all object update causes too bad performance. I think we should take another approach such as logging.
Signed-off-by: MORITA Kazutaka commit fc4f552866bcebd07f86a2829a59076298229203 Author: MORITA Kazutaka Date: Fri May 7 20:37:45 2010 +0900 collie: reject invalid node joining to the sheepdog When joining nodes have a wrong epoch information, sheepdog should reject the nodes and return the error result. Signed-off-by: MORITA Kazutaka commit 67ceac07128342d574eb56c3ddf95b38d71f558a Author: MORITA Kazutaka Date: Fri May 7 18:06:17 2010 +0900 collie: remove epoch request stuff Signed-off-by: MORITA Kazutaka commit 87132166d5ad64fc48729936f09d04ea7b5c4a3b Author: MORITA Kazutaka Date: Fri May 7 18:47:07 2010 +0900 collie: send epoch information with join_message Currently, the master node requests epoch information to the joining nodes to decide whether they can join or not. This patch makes the joining nodes send local epoch information with their join messages. Signed-off-by: MORITA Kazutaka commit 275db4d91351f08d25a3e64b4c183410840d88c7 Author: FUJITA Tomonori Date: Fri May 7 14:39:59 2010 +0900 retry indirect I/O requests if they fail due to epoch mismatch If indirect I/O requests fail due to epoch mismatch, we need to set up the epoch and node list of them and retry them. Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit 09afc801d5e8d5f24caec136658f3aee351e521b Author: FUJITA Tomonori Date: Fri May 7 14:26:08 2010 +0900 fix failed request handling in start_cpg_event_work req->done could modify cpg_event_siblings so we can't call req->done in the cpg_event_siblings loop. Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit f13360df093bb459d09dea82edd4f1de67ab45f1 Author: FUJITA Tomonori Date: Fri May 7 14:15:50 2010 +0900 avoid the race between recovery and IO requests - we can't perform IO requests against the object that we are recovering. - we can't recover the object that we are performing IO requests against. 
Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit 619e60282bdb9cadb316c0d8f93d22c826f1f37e Author: FUJITA Tomonori Date: Fri May 7 14:15:49 2010 +0900 avoid the race of object access from multiple IO requests We need to avoid performing multiple IO requests to the same object. Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit b701e1ab674e22ea8b17b61331898e0774b13e4a Author: FUJITA Tomonori Date: Thu May 6 20:43:12 2010 +0900 remove access to sys->sd_node_list in thread We can't access to sys->sd_node_list in thread. Note that add_vdi and lookup_vdi are safe since these operations are serialized with cpg_event. So epoch and sd_node_list don't change during these operations. TODO: fix del_vdi() that accesses to sys->sd_node_list in thread in the unsafe way. Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit 45d140e2f3765027b4d1e44c253c3f3b095acf69 Author: FUJITA Tomonori Date: Thu May 6 19:45:15 2010 +0900 remove unused exec_reqs helper function Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit f19d2e86dd6f69d844cc166696eafa7be415175a Author: FUJITA Tomonori Date: Thu May 6 19:45:37 2010 +0900 remove access to sys->epoch in threads We can't access to sys->epoch in threads. Use hdr->epoch instead. Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit 0c89f27c34eb0fc01dcf3143dc53a5c2bb59efa5 Author: FUJITA Tomonori Date: Thu May 6 19:45:36 2010 +0900 move epoch checking to start_cpg_evet_work from queue_request We can't access to sys->epoch after start_cpg_event_work (via work threads). Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit c174c06991e61ae35a8a19202cdb577160aefeb8 Author: FUJITA Tomonori Date: Thu May 6 17:12:08 2010 +0900 rename SD_FLAG_CMD_FORWARD to SD_FLAG_CMD_DIRECT "SD_FLAG_CMD_FORWARD" name is confusing since we need to use "SD_FLAG_CMD_FORWARD" for no-forwarded requests (for recovery, etc). 
This patch renames SD_FLAG_CMD_FORWARD to SD_FLAG_CMD_DIRECT. SD_FLAG_CMD_DIRECT means that the sender wants to send a request to a node "directly". Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit 16869111e98ffdfa0f1de6c71d43e5a167ba1410 Author: FUJITA Tomonori Date: Thu May 6 17:12:09 2010 +0900 use SD_FLAG_CMD_DIRECT for requests for VDI This also fixes a bug in write_object() that doesn't send the proper number of requests. Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit e274109d78029598a9badddbfa4170d45e4d1a9b Author: FUJITA Tomonori Date: Thu May 6 16:04:54 2010 +0900 remove useless semicolon Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit 62785ed9b95288de47abfc84d762bdbc9ffc2acc Author: FUJITA Tomonori Date: Thu May 6 16:06:23 2010 +0900 fix cpg_event leak Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit 859c9fdd27bb73e1394323c809140c2803aae941 Author: FUJITA Tomonori Date: Thu May 6 16:06:27 2010 +0900 stop performing cpg events during io requests Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit 0d4b9fa08a268e7a6a45f6e91f3841d68619ec4d Author: FUJITA Tomonori Date: Thu May 6 16:06:26 2010 +0900 stop performing requests during JOIN or confchg events Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit ccd93b7ee782e2d7a4b94999a1785c39ca4a9abc Author: FUJITA Tomonori Date: Thu May 6 16:06:25 2010 +0900 call queue_work for request to start_cpg_event_work preparation of executing requests and cpg events in the proper order. 
Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit e01dc7d8be9a29bfb35ebf74a5dc8f05da0bb12d Author: FUJITA Tomonori Date: Thu May 6 16:06:24 2010 +0900 move cpg_event struct to collie.h we want the cpg event thread to handle request struct Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit b352117553ceab9184b3af40d37517fb68fa51df Author: MORITA Kazutaka Date: Wed May 5 04:03:35 2010 +0900 split SD_STATUS_STARTUP There are two independent states in SD_STATUS_STARTUP: - collie is waiting for a format operation - collie is waiting for other nodes joining to the cluster so we should split it. This patch introduces SD_STATUS_WAIT_FOR_FORMAT and SD_STATUS_WAIT_FOR_JOIN. Signed-off-by: MORITA Kazutaka commit b887873ffe47bb215f8e343057b133f7d73049ec Author: MORITA Kazutaka Date: Wed May 5 01:45:48 2010 +0900 collie: fix collie starting with empty epoch directory If the empty epoch directory is already in the store directory, collie fails to start up without much information. This occurs, for example, when we stop collies before executing mkfs. This patch continues starting up in such a case. Signed-off-by: MORITA Kazutaka commit fa78a64c4ba6a16029617a95519cf430ee045661 Author: MORITA Kazutaka Date: Tue May 4 17:45:51 2010 +0900 collie: make the master node read global_nr_copies in __sd_confchg When there is only one node in the sheepdog cluster, the master node doesn't get join messages, so it should read the global number of object copies by itself. Signed-off-by: MORITA Kazutaka commit f705d7478521f12dfa57fad7f090f788df867eab Author: MORITA Kazutaka Date: Fri Apr 30 08:30:31 2010 +0900 fix wrong alignment Signed-off-by: MORITA Kazutaka commit 680946bd1fa6d0411740df6b844bc0a5f56a5c10 Author: MORITA Kazutaka Date: Fri Apr 30 08:19:05 2010 +0900 shrink vdi object size 64 bit oid field in the struct sheepdog_inode is redundant, because we can derive its lower 32 bit.
This patch reduces the vdi object size and increases metadata operation performance. Signed-off-by: MORITA Kazutaka commit 5ed728061e4a58b464617f82b29bdd833e606438 Author: MORITA Kazutaka Date: Wed Apr 28 17:58:55 2010 +0900 collie: fix partial vdi object update bug When clients update vdi objects partially, the objects may be broken if collies fail to update the object files, so we need to check whether their contents are correct or not. Signed-off-by: MORITA Kazutaka commit 1c0ea0d9efb1a0d45a44eaa8c9490daa14f72e25 Author: MORITA Kazutaka Date: Tue Apr 27 19:01:15 2010 +0900 update README Signed-off-by: MORITA Kazutaka commit 509e05b0072775a07f41f6734c5c1d4c1cdba34e Author: MORITA Kazutaka Date: Tue Apr 27 18:34:33 2010 +0900 update copyright year Signed-off-by: MORITA Kazutaka commit b39def675ddd7db502994dd1f4e767bd9058503d Author: FUJITA Tomonori Date: Fri Apr 23 08:38:45 2010 +0900 remove unnecessary access to sd_list in store_queue_request() queue_request() checks if the node is ready to accept a request. Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit 428052ff74e8533c5d9adba47410857ae59f4ca2 Author: MORITA Kazutaka Date: Fri Apr 23 02:13:55 2010 +0900 use calloc instead of malloc + memset This also silences make check warnings. Signed-off-by: MORITA Kazutaka commit cbce1e037873cf95bed81a7a48a031c794d17286 Author: FUJITA Tomonori Date: Thu Apr 22 20:30:13 2010 +0900 silence gcc warning looks like the warning is bogus. Just silence gcc Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit 6f5120dc7f72ef4068b7a3fd28b6e8ecb6292298 Author: OZAWA Tsuyoshi Date: Fri Apr 23 02:46:00 2010 +0900 refactor codes to pass semantic check by sparse Signed-off-by: OZAWA Tsuyoshi Signed-off-by: MORITA Kazutaka commit 562bb468c84c2755f9cf55fac609e8233b72ea33 Author: OZAWA Tsuyoshi Date: Fri Apr 23 02:38:38 2010 +0900 add semantic check by using sparse Sparse is a semantic parser used in Linux kernel. 
This patch provide sheepdog with semantic check. To run sparse, $ make check If script/checkarch.sh failed at running "make check", run $ make check32 or $ make check64 instead. Signed-off-by: OZAWA Tsuyoshi Signed-off-by: MORITA Kazutaka commit 8855b95578fb3cb42d32ce5e820ce4743628f0ee Author: FUJITA Tomonori Date: Thu Apr 22 16:19:45 2010 +0900 simplify work_queue we don't use multiple queues (and unlikely use it in the future). Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit 779873990b4afeb6b93c5013a134177fe5de2b01 Author: FUJITA Tomonori Date: Thu Apr 22 15:29:41 2010 +0900 call start_recovery only when sheepdog nodes leave We don't need to call start_recovery when nodes on cpg_list leave (that haven't finish the JOIN procedure). Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit ca48a825d95fa7118b5c064e959493c674ba54ec Author: MORITA Kazutaka Date: Thu Apr 22 16:18:14 2010 +0900 shepherd: fix showing wrong vdi size Signed-off-by: MORITA Kazutaka commit 0aba84217723db0ab7f958e9f86fd8b2ac92afeb Author: MORITA Kazutaka Date: Thu Apr 22 15:24:14 2010 +0900 collie: merge recover_one_done and __start_recovery_done These two functions have a lot in common, so let's merge them. Signed-off-by: MORITA Kazutaka commit c0db26e05232d4cfaee746c27209ccdd9186f87f Author: FUJITA Tomonori Date: Thu Apr 22 11:02:03 2010 +0900 inform new nodes of the running vm state When a node joins in Sheepdog, the master node informs it of the running vm state. Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit 4bf254673c8412e3c47216d7e65fe26011728b8a Author: FUJITA Tomonori Date: Thu Apr 22 09:21:57 2010 +0900 make cpg_event_fn and cpg_event_done static Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit 01ed3d4274b809fc0a2f9eaf11492d856f0119c8 Author: FUJITA Tomonori Date: Wed Apr 21 18:05:57 2010 +0900 split __sd_confchg - __sd_confchg looks too large. 
This cleans up it with a new helper function, for_each_node_list(). - if we fail to add a new node to oom, we can't continue. TODO: we need to call panic() in some failure in del_node(). Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit e6556d15fcb0ec9016bcacf2728db80f7388d584 Author: FUJITA Tomonori Date: Wed Apr 21 15:12:18 2010 +0900 kill the daemon in the case of confchg oom If we can't allocate memory for a confchg event, we can't continue. Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit 69cf9bdf72ae695d6a6613630f7a8a820040efa0 Author: FUJITA Tomonori Date: Wed Apr 21 15:12:16 2010 +0900 kill so_queue_request in collie.h The function was removed. Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit cfda78f76e0d9ec3131112ac1e3a5780da7b5af1 Author: FUJITA Tomonori Date: Wed Apr 21 15:12:17 2010 +0900 add panic() We should kill the daemon in the case of a critical error to avoid the worth events like data corruption. Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit 7de3934160940551893eafa82b660a2a1b1fecb2 Author: FUJITA Tomonori Date: Wed Apr 21 15:12:15 2010 +0900 rename sd_confch to sd_confchg 'confchg' name is commonly used since cpg_callbacks_t use it: typedef struct { cpg_deliver_fn_t cpg_deliver_fn; cpg_confchg_fn_t cpg_confchg_fn; } cpg_callbacks_t; Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit baa33fd0f8c4c96e0337ee272a8c030193b85fd4 Author: MORITA Kazutaka Date: Wed Apr 21 13:24:20 2010 +0900 collie: remove unused member from struct recovery_work Signed-off-by: MORITA Kazutaka commit fbc56e97c390f169a837608dd884d4134383f0e1 Author: FUJITA Tomonori Date: Wed Apr 21 13:03:47 2010 +0900 remove unused sys->node_list_idx Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit d8838b21148e74920e362efd9280dee092ff890b Author: FUJITA Tomonori Date: Wed Apr 21 13:03:46 2010 +0900 rename obscure sys->synchronized name We use sys->synchronized to see if 
the node finishes the JOIN procedure or not. Let's use more clear name, join_finished. Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit 1f432b7f238b3e37dcde808dee479413792cb3cc Author: OZAWA Tsuyoshi Date: Mon Apr 19 17:40:52 2010 +0900 add vm_clock_nsec and vm_state_size for live snapshot Add vm_clock_nsec and vm_state_size to sheepdog_inode for live snapshot. To work live snapshot patch correctly, this patch is needed. Signed-off-by: OZAWA Tsuyoshi Signed-off-by: MORITA Kazutaka commit 946eccb72a3d78fe82c0fd2a3e7800c402a4b142 Author: FUJITA Tomonori Date: Wed Apr 21 10:06:18 2010 +0900 serialize all cpg events There is a bug that deliver and confchg events are not performed in order. - we link all the cpg events to sys->cpg_event_siblings. - the above events are performed serially. - we need to ignore deliver events until we join sheepdog. - we can't use WORK_ORDERED since it blocks qemu I/Os. All cpg events are serialized so we don't need it. - we can't call join() and vdi_op() that can sleep for long time in the main process. - we need to think about two cases when we finish __sd_deliver_done() for a INIT message; a) we already got for the FIN for it or we haven't so we need to suspend the cpg event execution. Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit f0f4eb542fb386f4f971644ad6d10fefdacbcce2 Author: MORITA Kazutaka Date: Mon Apr 19 02:37:54 2010 +0900 collie: fix object consistency when qemu died during write operation If qemu dies during write object operation, consistency of the replicated objects would be broken. This patch fixes the broken consistency in the object recovery routines. 
Signed-off-by: MORITA Kazutaka commit 3a6fa427e8113e6208e0db9e37c336d06627376d Author: MORITA Kazutaka Date: Sun Apr 18 07:49:52 2010 +0900 collie: release vdi lock when qemu died with its host machine Signed-off-by: MORITA Kazutaka commit 7b4e7c7eb9f1b90377666cddba6290ec473fd73e Author: MORITA Kazutaka Date: Sat Apr 17 23:28:32 2010 +0900 collie: add asynchronous write support This patch uses poll when collie forward write requests to multiple nodes. Signed-off-by: MORITA Kazutaka commit 5239d93f9187a40fb9995dbd348319cf71bc667e Author: MORITA Kazutaka Date: Tue Apr 13 18:08:54 2010 +0900 collie: fix counting the number of worker threads Signed-off-by: MORITA Kazutaka commit 00f46e4433076d80e59435370ae1dda1302ae775 Author: MORITA Kazutaka Date: Tue Apr 13 17:25:52 2010 +0900 shepherd: fix memory violation when showing object information Signed-off-by: MORITA Kazutaka commit 9d5b33a2b28fd76121655a63d355153994006951 Author: MORITA Kazutaka Date: Tue Apr 13 17:25:42 2010 +0900 collie: set the seed for random() Signed-off-by: MORITA Kazutaka commit 318eff90ddbe2f300ff9ebf15e69a2bbf0e2b531 Author: MORITA Kazutaka Date: Tue Apr 13 16:28:00 2010 +0900 collie: update object list file atomically This is a temporary hack to update the object list file atomically. FNV hash value is added to the file as an extend attribute. Signed-off-by: MORITA Kazutaka commit 638f1985fec47265f4626c8461a8439820090ec6 Author: MORITA Kazutaka Date: Tue Apr 13 16:26:48 2010 +0900 collie: fix recovery thread race Signed-off-by: MORITA Kazutaka commit 6155ba2655e8812c90cee88b0d7447549b3c13f2 Author: MORITA Kazutaka Date: Wed Apr 14 14:04:01 2010 +0900 collie: enlarge logger buffer size 16 KB log buffer is too small, so let's enlarge it to 1 MB. Signed-off-by: MORITA Kazutaka commit a1caf5a90a73e1fede8b89a5bafa25d02325d174 Author: FUJITA Tomonori Date: Wed Apr 14 13:56:52 2010 +0900 fix ORDERED work handling bug fix a bug that A SIMPLE work wrongly passes blocked ORDERED works. 1. 
a SIMPLE work is on the pending_list 2. when a new ORDERED work comes, then it is added to the blocked_list. 3. then a new SIMPLE work comes, it's wrongly added to the pending_list. It will be executed wrongly before the above ORDERED. It should be delayed until the above ORDERED work finishes. Signed-off-by: FUJITA Tomonori Signed-off-by: MORITA Kazutaka commit 695d325192fc7388e8449979cc2f589ce137b0ef Author: FUJITA Tomonori Date: Mon Apr 12 23:28:55 2010 +0900 shepherd: fix debug lock_vdi and release_vdi Signed-off-by: FUJITA Tomonori commit b52d3ebab19b5eade0041f9956773352a1624e21 Author: FUJITA Tomonori Date: Mon Apr 12 23:28:55 2010 +0900 inform new nodes of the running vm state The master node needs to inform newly added nodes of the running vm state. Signed-off-by: FUJITA Tomonori commit 2d25761d743eb2f5d13c6851103e479b37940bec Author: FUJITA Tomonori Date: Mon Apr 12 23:23:02 2010 +0900 fix vdi object recovery looks like the recovery code wrongly assumes that the size of object is SD_DATA_OBJ_SIZE. Signed-off-by: FUJITA Tomonori commit 2a5866462293a530727338d5b14e4da6812d2fa8 Author: FUJITA Tomonori Date: Mon Apr 12 14:40:02 2010 +0900 remove unused argument in add_node Signed-off-by: FUJITA Tomonori commit 60ef980c27cdf3ffb420225c4942041456b866a9 Author: FUJITA Tomonori Date: Mon Apr 12 14:40:03 2010 +0900 keep the list of unfinished deliver messages on non master nodes non master nodes need to keep the list of unfinished deliver messages in the case of master failure. Signed-off-by: FUJITA Tomonori commit 9feeb3f29a2e12478e11f036d7644557dc5c12cb Author: MORITA Kazutaka Date: Sun Apr 11 18:36:14 2010 +0900 vditest: script to test sheepdog virtual disk images This is a wrapper script for qemu-io and performs repeated I/O accesses to sheepdog VDI without starting any virtual machines. Its options are similar to disktest in the Linux Test Project test suite. See `vditest -h' for more information.
Signed-off-by: MORITA Kazutaka commit f5ace113814da4d2e9f68d8a2823da7aa24f9228 Author: FUJITA Tomonori Date: Sat Apr 10 01:28:48 2010 +0900 simplify the usage of sd_node_list and cpg_node_list We currently do: 1. when we find the node on corosync, we allocate the node and add it to cpg_node_list. 2. when the node joins sheepdog (completes the JOIN process), we allocate the node and add it to sd_node_list. This patch simplifies the above: 1. when we find the node on corosync, we allocate the node and add it to cpg_node_list. 2. when the node joins sheepdog (completes the JOIN process), we move the node from cpg_node_list to sd_node_list. The node is on cpg_node_list _OR_ sd_node_list. I thought about managing the nodes on a single list but the code doesn't look simpler than this approach. I also add some comments. Signed-off-by: FUJITA Tomonori commit 2d55536ad517558ef10c2bc3753e1ef5defc9b37 Author: MORITA Kazutaka Date: Sat Apr 10 10:35:39 2010 +0900 shepherd: fix showing wrong info when shepherd cannot connect to any node Signed-off-by: MORITA Kazutaka commit 6c49c9987004f920725f03506c523eb87d19557b Author: MORITA Kazutaka Date: Fri Apr 9 21:49:46 2010 +0900 collie: fix a vdi operation race When we send a vdi operation to the collie, the following procedures are executed: (1) the operation message 'A' is sent to the corosync (2) __sd_deliver is called with w->msg == 'A' (3) the completion message 'B' is sent to the corosync (4) __sd_deliver_done is called with w->msg == 'A' (5) __sd_deliver is called with w->msg == 'B' (6) __sd_deliver_done is called with w->msg == 'B' These must be called in this order. This patch ensures that (4) is called before (5).
Signed-off-by: MORITA Kazutaka commit 391af9abf9cbcea51104d491565d5201eacf175c Author: FUJITA Tomonori Date: Fri Apr 9 16:10:45 2010 +0900 remove unnecessary sd_node_list access Signed-off-by: FUJITA Tomonori commit ef6cf2b4c745a7a0e7b220b1c109635329d895ad Author: FUJITA Tomonori Date: Fri Apr 9 16:10:44 2010 +0900 remove needless sd_node_list access Signed-off-by: FUJITA Tomonori commit 0ed0bf3de858c43da4bad3c431c408cd44ce6481 Author: FUJITA Tomonori Date: Fri Apr 9 16:10:43 2010 +0900 remove useless sd_node_list checking in is_master() - for non masters, if sys->synchronized is non zero, sys->sd_node_list should not be empty. - for the master, confchg is called before deliver. confchg sets sys->synchronized and adds itself to sd_node_list. Signed-off-by: FUJITA Tomonori commit fcc3681571cc6e56d79bd88ce3a9ffbf31dba1db Author: FUJITA Tomonori Date: Fri Apr 9 16:10:42 2010 +0900 add get_ordered_sd_node_list helper function Signed-off-by: FUJITA Tomonori commit 6d9dcd67a86e8b6c7a8860550de5d5a6c9dba4cf Author: FUJITA Tomonori Date: Fri Apr 9 16:10:41 2010 +0900 remove the useless check in join() Only the master calls join() and the master sets sys->synchronized before calling join(). Signed-off-by: FUJITA Tomonori commit b7aa6d9646b703dd9009ea8794df841b4dfe6da8 Author: FUJITA Tomonori Date: Fri Apr 9 16:10:40 2010 +0900 remove useless code in update_cluster_info sd_node_list should be empty here. 
Signed-off-by: FUJITA Tomonori commit 4eed7d7a4a4fae358fabdc9fd9f48673205406cb Author: FUJITA Tomonori Date: Fri Apr 9 16:01:00 2010 +0900 make print_node_list() macro We can know exactly who call print_node_list(): Apr 09 12:27:45 update_cluster_info(551) nodeid: 1004c0a, pid: 20692, ip: 10.76.0.1:7002 Apr 09 12:27:45 update_cluster_info(551) nodeid: 1004c0a, pid: 20701, ip: 10.76.0.1:7003 Apr 09 12:27:45 update_cluster_info(551) nodeid: 1004c0a, pid: 20710, ip: 10.76.0.1:7004 Apr 09 12:27:45 update_cluster_info(551) nodeid: 1004c0a, pid: 20719, ip: 10.76.0.1:7005 Apr 09 12:27:45 update_cluster_info(551) nodeid: 1004c0a, pid: 20728, ip: 10.76.0.1:7006 Apr 09 12:27:45 update_cluster_info(551) nodeid: 1004c0a, pid: 20737, ip: 10.76.0.1:7007 Apr 09 12:27:45 update_cluster_info(551) nodeid: 1004c0a, pid: 20746, ip: 10.76.0.1:7008 Signed-off-by: FUJITA Tomonori commit 51f7d2a9f0be017168e79e1048bab0293d7599da Author: MORITA Kazutaka Date: Fri Apr 9 14:42:57 2010 +0900 collie: fix object read from the local node Signed-off-by: MORITA Kazutaka commit e276c7002ac3a25d12dd416e3355cfca6ca68e57 Author: MORITA Kazutaka Date: Fri Apr 9 05:05:54 2010 +0900 fix calculation of data object id We need to mask VDI_BIT to calculate data object id. Signed-off-by: MORITA Kazutaka commit e88252368e43db3011b6279a65d8cf3df8bf67c0 Author: MORITA Kazutaka Date: Fri Apr 9 03:28:10 2010 +0900 support vdi deletion This patch adds preliminary support for vdi deletion. Usage: shepherd delete vdiname [-i snapshot_id] '-i' argument is required when you want to delete snapshot. Note: Currently, there is a restriction on this feature. Data objects are not reclaimed until all vdis with same name are deleted. i.e.
name id size used shared creation time object id -------------------------------------------------------------------- windows 0 4 GB 2 GB 0 MB 2010-01-09 15:01 80000 linux 0 4 GB 1 GB 1 GB 2010-01-09 15:02 c0000 s linux 1 4 GB 2 GB 0 MB 2010-01-09 15:01 40000 To free space used by the linux vdi, you must run the following two. $ shepherd delete linux $ shepherd delete linux -i 1 Running only either of them doesn't cause object reclaiming though the vdi disappears from the output of vdi listing. If the vdi is cloned from another vdi, you also need to delete all the base vdi to reclaim data objects. Signed-off-by: MORITA Kazutaka commit 5ef8f83ed85dfa2b0921071e847dfee69388d55f Author: FUJITA Tomonori Date: Fri Apr 9 03:03:43 2010 +0900 fix master selection race We wrongly assume that the node that joins corosync sends Sheepdog's JOIN message before the other nodes do. We hit a bug that two nodes are the master node temporarily. This patch makes sure that the node that joins corosync first will be always the master. Signed-off-by: FUJITA Tomonori commit 52432dbdaa94f22685222155bd28d18b56634ca1 Author: FUJITA Tomonori Date: Fri Apr 9 02:56:31 2010 +0900 include linux/limits.h for PATH_MAX The commit 3ce47e27cf91476328fc4b19e65588947738fb19 breaks the tree on Ubuntu 9.10: http://lists.wpkg.org/pipermail/sheepdog/2010-April/000308.html Reported-by: Wido den Hollander Tested-by: Wido den Hollander Signed-off-by: FUJITA Tomonori commit 1edc1f06795fcc598c0255efbbeed5ab50c5745c Author: FUJITA Tomonori Date: Thu Apr 8 16:31:31 2010 +0900 use semop instead of setimedop seems that we lost too many log messages. it's not good to let the main daemon sleep for long time but let's see how it works. 
Signed-off-by: FUJITA Tomonori commit 8f0f5a8222b0ee745ef6b8970e507e89d513f8f6 Author: MORITA Kazutaka Date: Thu Apr 8 10:55:57 2010 +0900 shepherd: fix stack overflow when getting object information Signed-off-by: MORITA Kazutaka commit 3ce47e27cf91476328fc4b19e65588947738fb19 Author: FUJITA Tomonori Date: Thu Apr 8 12:51:01 2010 +0900 collie: stop using syslog - distributions save a syslog in different places. - only root can access to a syslog - when you run multiple collie daemons on the single host, it's difficult to read a single log file. So this patch makes collie daemon use the own log file like: fujita@rose:~/git/sheepdog$ ./collie/collie /tmp/u/ fujita@rose:~/git/sheepdog$ ls -l /tmp/u total 12 -rw-r--r-- 1 fujita fujita 1103 2010-04-07 18:09 collie.log drwxr-x--- 2 fujita fujita 4096 2010-04-07 18:09 epoch drwxr-x--- 2 fujita fujita 4096 2010-04-07 18:09 obj Signed-off-by: FUJITA Tomonori commit be8cd4eda3d67601e6f0e24fccdcc12588e9bed2 Author: MORITA Kazutaka Date: Tue Apr 6 17:57:22 2010 +0900 shepherd: add kill option for debugging This patch supports killing one node in the sheepdog cluster. Usage: $ shepherd debug -o kill [hostname] Signed-off-by: MORITA Kazutaka commit 26507f06f9bf29f27c4c5c31514d5d0dd48d3413 Author: FUJITA Tomonori Date: Tue Apr 6 14:16:02 2010 +0900 collie: remove unused master_node in struct join_message Signed-off-by: FUJITA Tomonori commit a5bd0a7f7a94836cb9e77673326532224671e38b Author: FUJITA Tomonori Date: Tue Apr 6 11:33:01 2010 +0900 collie: silence semaphore warning with multiple local collie daemons We get semaphore warnings if multiple collie daemons run on the local host. Signed-off-by: FUJITA Tomonori commit 8ced65077427acf91dc310d7958dbbda861d81e2 Author: MORITA Kazutaka Date: Tue Apr 6 03:14:45 2010 +0900 collie: handle double node failure Currently, sheepdog cannot handle node failure when recovery is invoked, so double node failure at once leads to system down. This patch supports it. 
Signed-off-by: MORITA Kazutaka commit 2e8d8e69dce1fbe4aaff46927baf8582948ce021 Author: MORITA Kazutaka Date: Tue Apr 6 02:52:04 2010 +0900 collie: fix node addition bug Newly added machine needs to get current epoch number from the master if cluster has already started. Signed-off-by: MORITA Kazutaka commit 5330dfe6fef4f2d11a23ba059a1ab2c0116c4a02 Author: FUJITA Tomonori Date: Mon Apr 5 21:02:09 2010 +0900 collie: add is_myself helper function we use various ways to see if a node entry is myself or not. So let's add a helper function to do it. Signed-off-by: FUJITA Tomonori commit 9b68eeffeff7f3473a84336c9e3813f3e0034ba2 Author: FUJITA Tomonori Date: Thu Apr 1 14:46:07 2010 +0900 collie: shutdown command kills the daemon Signed-off-by: FUJITA Tomonori commit 3a610471a7a1c2c009658c5254f07b7f39e8391e Author: FUJITA Tomonori Date: Mon Mar 29 00:06:36 2010 +0900 store nodeid persistently collie calculates the nodeid every time from the ip address it boots up. This patch stores the nodeid persistently instead. 
Signed-off-by: FUJITA Tomonori commit 06c28e3676f287cafa3a7328ed0455631357ae0b Author: FUJITA Tomonori Date: Sun Mar 28 21:32:17 2010 +0900 get ip address from corosync Signed-off-by: FUJITA Tomonori commit a9fc5d9ae59463369c335be5f7a4c97f1a3a4b91 Author: FUJITA Tomonori Date: Sun Mar 28 07:56:04 2010 +0900 add reboot support without the super object Signed-off-by: FUJITA Tomonori commit 8ac1dfab55e1ead9267aa195bda2c95945a56065 Author: FUJITA Tomonori Date: Fri Mar 26 10:17:21 2010 +0900 remove the super object code Signed-off-by: FUJITA Tomonori commit e6b2532436800ad0e8953b76a56952fb061f6e5f Author: FUJITA Tomonori Date: Fri Mar 26 10:17:20 2010 +0900 manage VDIs without the super object Signed-off-by: FUJITA Tomonori commit 6c0c3bc7394854a6c5938aba45d5ad62a7b2f9b6 Author: FUJITA Tomonori Date: Mon Mar 15 02:57:25 2010 +0900 add log level support Supports the following eight log levels: SDOG_EMERG 0 SDOG_ALERT 1 SDOG_CRIT 2 SDOG_ERR 3 SDOG_WARNING 4 SDOG_NOTICE 5 SDOG_INFO 6 SDOG_DEBUG 7 Note that they correspond to syslog message levels. A new logging function, vprintf supports the above. vprintf works like printk in linux kernel: vprintf(SDOG_NOTICE "Sheepdog daemon (version %s) started\n", SD_VERSION); like printk, you can use vprintf without SDOG_ level (used SDOG_INFO by default), however, it's a good idea to specify the level at all times. By default, collie prints messages that are more important than INFO. You can change the default log level with "loglevel" boot option. Signed-off-by: FUJITA Tomonori commit 279a33d3eca98424979b81e11e7a9e9d38d3d292 Author: MORITA Kazutaka Date: Fri Mar 12 12:45:13 2010 +0900 shepherd: support getting cluster status `shepherd info -t cluster` will show cluster status and current epoch information. 
Signed-off-by: MORITA Kazutaka commit da3b66b3944c4c152f3a782ed2fdc0388e943b48 Author: MORITA Kazutaka Date: Fri Mar 12 12:39:42 2010 +0900 collie: support sheepdog shutdown and restart Conditions sheepdog can start automatically and safely are: - all nodes have same epochs, which means `shepherd shutdown` was executed before shutting down sheepdog nodes - newly added machines have no local epochs and objects Otherwise collie returns SD_RES_INCONSISTENT_EPOCH error. Signed-off-by: MORITA Kazutaka commit f18a0a743b4b735c1bed25c2b5d915776c7a837d Author: MORITA Kazutaka Date: Wed Mar 10 17:04:39 2010 +0900 collie: fix alignment Signed-off-by: MORITA Kazutaka commit c43441bbef00ac3aa059677f5b526529777c553f Author: MORITA Kazutaka Date: Wed Mar 10 16:49:45 2010 +0900 collie: support safely shutdown Signed-off-by: MORITA Kazutaka commit aea612cca46cd8689c9e9f4296eb8323d96f1f67 Author: MORITA Kazutaka Date: Wed Mar 10 15:34:53 2010 +0900 save cluster creation time to the epoch dir All nodes need to know when make fs is invoked to support safe start up. Signed-off-by: MORITA Kazutaka commit 0f9669a09dcd430bf3c369139ecaf99a4f09f992 Author: MORITA Kazutaka Date: Wed Mar 10 15:31:02 2010 +0900 collie: fix start up Collie shouldn't join cpg members until it can surely start up. Signed-off-by: MORITA Kazutaka commit abc628234b75a382b1f33af7998303d9f23ec277 Author: MORITA Kazutaka Date: Wed Mar 10 15:13:34 2010 +0900 fix node list bug We need to separate a node list into sheepdog node list and corosync node list because corosync leave messages may come during sheepdog is processing join messages and it will break node list information. 
Signed-off-by: MORITA Kazutaka commit 3c7269844a48afa249aabea32a1eb2136b43e5a6 Author: MORITA Kazutaka Date: Wed Mar 10 15:07:42 2010 +0900 add addr_to_str to format address easily Signed-off-by: MORITA Kazutaka commit f23f2286bc8350ef750a84ee682436a64f8d87bd Author: MORITA Kazutaka Date: Tue Mar 9 19:42:09 2010 +0900 shepherd: fix displaying object information Signed-off-by: MORITA Kazutaka commit 78e93568df16fb0ed32ed45d1e5b6025617abbf0 Author: FUJITA Tomonori Date: Mon Feb 15 21:35:00 2010 +0900 make the cluster_info global We will only ever have one cluster_info and it should be global. Passing it around just makes the code complicated needlessly. Let's make the cluster_info global. Signed-off-by: FUJITA Tomonori commit 88e7a4e9e23f47fd580703c72205fdb2d78dc7c8 Author: FUJITA Tomonori Date: Mon Feb 15 19:41:50 2010 +0900 work: fix the usage of work structure after freeing work->done hook could free the work structure so we can't use the work structure. Due to this bug, we might not call work_queue_clear_blocked() then collie daemon is completely blocked. Signed-off-by: FUJITA Tomonori commit ae5be73aa60c251ce7354c70cf31ee11f13c67f6 Author: MORITA Kazutaka Date: Mon Feb 15 14:43:18 2010 +0900 shepherd: use appropriate units in the output `shepherd info` This patch makes the output of `shepherd info` more human readable. Signed-off-by: MORITA Kazutaka commit ff5a9ac9761350f7a118dd49e6aa8e9ecc679437 Author: MORITA Kazutaka Date: Mon Feb 15 11:18:38 2010 +0900 collie: fix a calculation of disk usage Signed-off-by: MORITA Kazutaka commit de60024928aa50d3b4e1d51b205d3931462e2560 Author: MORITA Kazutaka Date: Fri Feb 12 15:34:37 2010 +0900 collie: set SO_LINGER socket option To avoid exhausting local ports, we set the linger time to zero. NOTE: This change may cause a problem. I guess we should create socket descriptor pools for each thread, and reuse them.
Signed-off-by: MORITA Kazutaka commit d993db32a0a93957ed8aca47ba57bbf6cdd70560 Author: MORITA Kazutaka Date: Fri Feb 12 15:42:28 2010 +0900 collie: fix a calculation of free disk spaces Signed-off-by: MORITA Kazutaka commit e09df66813239501069ca519d96d7bdc85a93d00 Author: MORITA Kazutaka Date: Thu Feb 11 23:52:49 2010 +0900 collie: retry forwarding requests when network errors have occurred Write requests must be forwarded to all target nodes, so we cannot skip here. Signed-off-by: MORITA Kazutaka commit e9ed3f28b28a579c09032306d9dd300f66b1f454 Author: MORITA Kazutaka Date: Thu Feb 11 20:36:02 2010 +0900 collie: fix a socket discriptor leak Signed-off-by: MORITA Kazutaka commit c1078e1f86f52a0692fb36137183caada0089a6d Author: MORITA Kazutaka Date: Thu Feb 11 16:12:33 2010 +0900 collie: add more detailed error messages Signed-off-by: MORITA Kazutaka commit 8d62e63aee65f5e2ef6edb0496974bb0ae1dadce Author: MORITA Kazutaka Date: Sun Feb 7 05:18:43 2010 +0900 shepherd: clean up treeview routines Signed-off-by: MORITA Kazutaka commit 21e125b7d5c13557d40780d69aa31df4f1187e65 Author: MORITA Kazutaka Date: Thu Feb 4 20:17:39 2010 +0900 shepherd: fix a tree view of VDIs Signed-off-by: MORITA Kazutaka commit de13a04ca89dc882ce107dbba83d105e6f29de60 Author: MORITA Kazutaka Date: Sat Jan 9 22:58:29 2010 +0900 shepherd: change the output of vdi list simplify the output of `shepherd info -t vdi` because its line-length was too long. New output is name id size used shared creation time object id -------------------------------------------------------------------- windows 0 4 GB 0 MB 0 MB 2010-01-09 15:01 80000 linux 0 4 GB 0 MB 0 MB 2010-01-09 15:02 c0000 s linux 1 4 GB 0 MB 0 MB 2010-01-09 15:01 40000 Lines of snapshot vdis start with 's'. Signed-off-by: MORITA Kazutaka commit 259a8286afb29fee6948f65038de3a723f15f4b9 Author: MORITA Kazutaka Date: Tue Jan 26 17:06:43 2010 +0900 add version information The version string is auto-generated by git commit hash values. 
Signed-off-by: MORITA Kazutaka commit e5b689c8f4fd74e5b7fd4209943050caa69ceb2f Author: MORITA Kazutaka Date: Tue Jan 26 14:33:02 2010 +0900 collie: avoid using an invalid address Some distributions contain `127.0.1.1' in /etc/hosts. We avoid using these kind of invalid addresses. Signed-off-by: MORITA Kazutaka commit 49bde752cc4326bf0cc7f00347158f3fb6508b41 Author: FUJITA Tomonori Date: Tue Jan 26 13:34:56 2010 +0900 collie: fix cow read Signed-off-by: FUJITA Tomonori commit 3b07e7b9a6c201b77acd8b45d8309ce606e96d0d Author: FUJITA Tomonori Date: Tue Jan 26 13:34:55 2010 +0900 collie: forward READ_VDIS Signed-off-by: FUJITA Tomonori commit 6d8e1900ef185441fcfacb7e79605d0e5b09fc0f Author: FUJITA Tomonori Date: Tue Jan 26 13:34:54 2010 +0900 collie: allow qemu not to set epoch field qemu always sends requests to a local dog so and he forwards them. So no need to require qemu to set epoch field. Signed-off-by: FUJITA Tomonori commit fa22cc610d0a4e37e21759becdc928016f783f88 Author: FUJITA Tomonori Date: Tue Jan 26 13:34:53 2010 +0900 remove unused epoch array in struct sheepdog_inode Signed-off-by: FUJITA Tomonori commit f3fb4979b450f37d7b2ef415d37b050cc7a66a9d Author: FUJITA Tomonori Date: Tue Jan 26 13:34:52 2010 +0900 collie: fix VDI id lookup needs to return SD_RES_NO_VDI if the snapshot id is not found. 
Signed-off-by: FUJITA Tomonori commit c177e6b8ff79e42c1edf2e0bf5e194cb118f6a97 Author: FUJITA Tomonori Date: Tue Jan 26 13:34:51 2010 +0900 collie: support snapshot tag strig fujita@rose:~/git/qemu-kvm$ qemu-img snapshot -c test3 sheepdog:linux fujita@rose:~/git/qemu-kvm$ qemu-img snapshot -l sheepdog:linux Snapshot list: ID TAG VM SIZE DATE VMCLOCK 1 test1 0 2010-01-19 00:09:39 00:00:00.000 2 test2 0 2010-01-19 00:10:34 00:00:00.000 3 test3 0 2010-01-19 00:10:43 00:00:00.000 Signed-off-by: FUJITA Tomonori commit f36d611c9ca2b92a0126f30ba16ca0f58528adf9 Author: FUJITA Tomonori Date: Tue Jan 26 13:34:50 2010 +0900 collie: sort snapshots in SD_OP_SO_READ_VDIS response sort SD_OP_SO_READ_VDIS response in snapshot id order as qcow2 does. Signed-off-by: FUJITA Tomonori commit d2f90af1ccd3730acc4dc7b0f4906aa75cf04af3 Author: FUJITA Tomonori Date: Tue Jan 26 13:34:49 2010 +0900 add snapshot id and tag support to sheepdog_vdi_info struct this is a preparation for snapshot id and tag support. Signed-off-by: FUJITA Tomonori commit 42529e3176cd2b66eba4ca184f7d637890e5e3e8 Author: FUJITA Tomonori Date: Tue Jan 26 13:34:48 2010 +0900 collie: add snapshot id support qemu (qcow2) supports the id of snapshots, which can be used to specify a snapshot. the ids are automatically generated when creating a snapshot. This patch adds the above support to sheepdog. Signed-off-by: FUJITA Tomonori commit 31c615866fd9c5aeb068187c80cdc832696e875e Author: FUJITA Tomonori Date: Tue Jan 26 13:34:47 2010 +0900 add SD_FLAG_CMD_SNAPSHOT to specify the creation of snapshots SD_OP_NEW_VDI uses the tag value to specify the creation of a normal vdi or snapshot. We'll change the tag to a string to obey the qemu logic. This introduces SD_FLAG_CMD_SNAPSHOT to specify the creation of snapshots. 
Signed-off-by: FUJITA Tomonori commit 9ddc1935321ba168ed6eb8976cfd14465b9cb205 Author: FUJITA Tomonori Date: Fri Jan 15 17:53:04 2010 +0900 collie: avoid forwarding requests locally Instead of avoiding forwarding requests locally, let's perform local file operations directly. Signed-off-by: FUJITA Tomonori commit 0007280ed6394a89bc54b18617dafb6135c2c5c4 Author: FUJITA Tomonori Date: Thu Jan 14 16:54:50 2010 +0900 collie: fix some fd leaks Signed-off-by: FUJITA Tomonori commit 90572523a998b1d7bc7e3429e07f9250f0a9bf87 Author: FUJITA Tomonori Date: Mon Jan 18 15:26:22 2010 +0900 collie: add recovery support This enables nodes to try to recover objects after epoch increase. Recover during node increase is disable for now since it's broken. There are still lots of TODOs. We can't even handle double failure. Signed-off-by: FUJITA Tomonori commit 74fe8ed70bfabc2ed4c7166999866b15da338c6c Author: FUJITA Tomonori Date: Wed Jan 13 13:41:02 2010 +0900 collie: add OP_GET_OBJ_LIST support OP_GET_OBJ_LIST enables nodes to get the list of objects from another node. It's necessary for recovery. Signed-off-by: FUJITA Tomonori commit d3e1e37c3e87e2e89dec7e1dc1f63b4266fcd1bd Author: FUJITA Tomonori Date: Thu Jan 14 11:51:16 2010 +0900 collie: print node id as a simple 64bit value Now we use 64bit for node id. So let's print it simply. Signed-off-by: FUJITA Tomonori commit 1843408a7fd3e15dfe6cc8afe867498408137c27 Author: FUJITA Tomonori Date: Wed Jan 13 12:56:32 2010 +0900 collie: simplify serialization execution Using multiple queues leads to pthread locking/waiting complexity. Instead, this implements a simple serialization execution inside a single queue. Setting work->attr to WORK_ORDERED guarantees that the work is serialized; all the previous works finished, no new works are performed until the task finish. 
Signed-off-by: FUJITA Tomonori commit 1f77aec6b0fdbb57f1cd37ab9ac7dd6263bce26a Author: MORITA Kazutaka Date: Fri Jan 8 02:00:43 2010 +0900 collie: set an appropreate error value when failed in vdi manipulation Signed-off-by: MORITA Kazutaka commit e6d9a66d338199ad9674a5dac247bebbc41b30c2 Author: FUJITA Tomonori Date: Thu Jan 7 20:33:32 2010 +0900 collie: add some error handling to request forwarding support - we fail if one of writes fails. - we fail if all the reads fail. we need to handle other errors such as epoch mismatch. Signed-off-by: FUJITA Tomonori commit 8833d45640ab0bbcad1e0630f92d114e9d0bc3ba Author: FUJITA Tomonori Date: Thu Jan 7 20:08:37 2010 +0900 collie: create hard link of objects after updating epoch We need to create hard link of objects after updating epoch. We do during adding nodes but somehow we don't during removing nodes. Signed-off-by: FUJITA Tomonori commit 2f96660a214687caee14b3d8a92c829323aace3b Author: FUJITA Tomonori Date: Thu Jan 7 16:33:48 2010 +0900 collie: block object operations during removing nodes We already block object operations during adding nodes but somehow we forget to do removing nodes. Signed-off-by: FUJITA Tomonori commit 673fa1b17dba33e3e92ec9a4614756fe6ea0bebd Author: FUJITA Tomonori Date: Thu Jan 7 13:05:17 2010 +0900 collie: fix __sd_confch() bug What we want to do here is if we find us in joined_list then we send a JOIN request. Signed-off-by: FUJITA Tomonori commit 219471f6d70435bd0783c1cd575a1bb67981abe7 Author: FUJITA Tomonori Date: Thu Jan 7 12:58:57 2010 +0900 collie: store two epoch log entries at least We need two entries at least to recover objects. Signed-off-by: FUJITA Tomonori commit 257e91ac86eceea58995c375c26c5fc965548420 Author: FUJITA Tomonori Date: Wed Jan 6 18:46:54 2010 +0900 collie: create hard link of objects after updating epoch We create hard link of object that we are responsible for after updating epoch. 
Signed-off-by: FUJITA Tomonori commit 097cf710be677f947a4a6603d3ba315f267f71b9 Author: FUJITA Tomonori Date: Wed Jan 6 17:31:55 2010 +0900 collie: store the relationship objects and epoch We store objects under obj/"epoch"/"object id". We need to store the relationship objects and epoch to recover object properly. Signed-off-by: FUJITA Tomonori commit 8f37ad2675750a7d0841fbbd94037510c2a06f24 Author: FUJITA Tomonori Date: Wed Jan 6 17:17:50 2010 +0900 collie: make sure no outstanding requests during updating epoch Signed-off-by: FUJITA Tomonori commit 0597678019d4762f1f51aa227fc4685a02d20c2b Author: FUJITA Tomonori Date: Wed Jan 6 16:29:52 2010 +0900 collie: postpone performing requests until setting up ci->epoch We can't performing requests properly until setting up ci->epoch. Signed-off-by: FUJITA Tomonori commit b6c06790a8acf475ee9a7d50f266653216816521 Author: FUJITA Tomonori Date: Wed Jan 6 16:03:31 2010 +0900 collie: rename init/exit_worker to init/exit_work_queue Use more appropriate names. Signed-off-by: FUJITA Tomonori commit 6fdaedf1066d2b132ad1c51444c1ef3c1ecb92fc Author: FUJITA Tomonori Date: Wed Jan 6 15:40:49 2010 +0900 collie: add epoch logging support store sheepdog_node_list_entry under /epoch/ to recover objects properly. Signed-off-by: FUJITA Tomonori commit 20c93dfc6a00192912389bcb4a5c00cbba57c013 Author: FUJITA Tomonori Date: Wed Jan 6 11:49:29 2010 +0900 collie: serialize sd_confch operations We need to serialize sd_confch operations so use single thread for it. Signed-off-by: FUJITA Tomonori commit eab6e139b800712d87d5468ca404c7c24d64ee73 Author: FUJITA Tomonori Date: Thu Jan 7 19:50:27 2010 +0900 collie: extend worker framework to handle multiple work queues Signed-off-by: FUJITA Tomonori commit cc48fe0fb1e1b7bc8e8dc82462fc8a73532c3f76 Author: FUJITA Tomonori Date: Tue Jan 5 20:10:11 2010 +0900 collie: remove the duplicated same mode_t for objects and directories define the default mode_t for objects and directories. 
Signed-off-by: FUJITA Tomonori commit 661a1da85fe40637136cb98f16c607abcb5b5861 Author: FUJITA Tomonori Date: Thu Jan 7 19:47:17 2010 +0900 collie: clean up path initialization Signed-off-by: FUJITA Tomonori commit 5a25bc315d25ab0e1f7b4861672f50a37b457b26 Author: FUJITA Tomonori Date: Sat Jan 2 13:29:00 2010 +0900 collie: add request forwarding hack set copies since qemu doesn't set yet. We'll remove this hack later. Signed-off-by: FUJITA Tomonori commit 27c11c99da620e2a677482e4d60ab6b5a007a3db Author: FUJITA Tomonori Date: Sat Jan 2 11:29:32 2010 +0900 collie: convert write_object() to use request forwarding Set copies field in a header. Signed-off-by: FUJITA Tomonori commit fe429f51f300c0b88b59085975fe2c7e40350e5f Author: FUJITA Tomonori Date: Sat Jan 2 01:00:10 2010 +0900 collie: remove unused SD_OP_UPDATE_EPOCH Signed-off-by: FUJITA Tomonori commit 4416ccc5c2f9ec49bb575f73f5731c8c6cf15938 Author: FUJITA Tomonori Date: Fri Jan 1 23:44:58 2010 +0900 collie: fix some leaks Signed-off-by: FUJITA Tomonori commit 5dfe0bc961fb4ee8f1e69845ac83d57550911dae Author: FUJITA Tomonori Date: Fri Jan 1 22:13:18 2010 +0900 collie: don't use SD_OP_SO_STAT for copies use ci->copies instead. Signed-off-by: FUJITA Tomonori commit 4fe31db7be973d19cb8c2afe7a661967cdac1151 Author: FUJITA Tomonori Date: Thu Jan 7 19:39:35 2010 +0900 collie: keep the default number of replication in cluster_info Let the nodes know the default number of replication to simplify the code. TODO: proper recovery after shutdown. Signed-off-by: FUJITA Tomonori commit e1cfcc655fe4beeaa9d3ee2304496fb0108aba7c Author: FUJITA Tomonori Date: Fri Jan 1 21:09:03 2010 +0900 collie: define VDI_PATH Avoid spreading tons of "/vdi" over the code. 
Signed-off-by: FUJITA Tomonori commit 2583e801bb4fec894543b5c68e5587b74446a0d8 Author: FUJITA Tomonori Date: Fri Jan 1 18:29:14 2010 +0900 collie: add request forwarding support TODO: error handling Signed-off-by: FUJITA Tomonori commit 78c6a86cc13456f41a4146db7baa33def293aa0e Author: FUJITA Tomonori Date: Thu Dec 31 18:36:55 2009 +0900 collie: move 'open' out of store_queue_request We need to move all the local operations out of store_queue_request to support request forwarding. Signed-off-by: FUJITA Tomonori commit d0d05516d17c9d7158b6dcfcd63910be2e64899e Author: MORITA Kazutaka Date: Tue Dec 29 07:05:34 2009 +0900 use ANAME_COPIES as a number of replication Signed-off-by: MORITA Kazutaka commit a2d006ba17f0f6beaebf40832f6cae05108e8b2f Author: FUJITA Tomonori Date: Tue Dec 22 16:36:24 2009 +0900 fix the attribute name typo Signed-off-by: FUJITA Tomonori commit e1838d1b8aa261500d80e46193f1f389352330e0 Author: FUJITA Tomonori Date: Tue Dec 22 16:36:23 2009 +0900 check the return value of opendir Signed-off-by: FUJITA Tomonori commit 74a22d85a5fed9120ea4d33aab36bd7d0ebdf4a2 Author: FUJITA Tomonori Date: Tue Dec 22 16:36:22 2009 +0900 fix exec_reqs() for multiple nodes exec_reqs() uses hdr for responses so it breaks hdr. TODO: we really need to clean up and merge write_object, read_object, exec_reqs. Signed-off-by: FUJITA Tomonori commit 389552cc8cddde379714a364886f9b4f52493175 Author: MORITA Kazutaka Date: Mon Dec 21 18:43:35 2009 +0900 change the hash function from SHA1 (160 bit) to FNV-1a (64 bit) Signed-off-by: MORITA Kazutaka commit 8352d17e38b3b0203df3f7e9e76943e2276767cd Author: FUJITA Tomonori Date: Mon Dec 21 17:04:17 2009 +0900 add SD_OP_SO_READ_VDIS It is necessary to parse vdis (shepherd). TODO: handling many vdis. 
Signed-off-by: FUJITA Tomonori commit 041c81b5990728eb2ecdce76db209e9a6bd5bc04 Author: FUJITA Tomonori Date: Mon Dec 21 17:04:16 2009 +0900 store vdi as directories Here's an exmaple: ./linux ./linux/0000000000080000-4b2f0541 ./linux/0000000000040000-00000000 ./linux2 ./linux2/00000000000c0000-00000000 'linux' and 'linux2' are vdi names. 'linux' vdi has one snapshot. '0000000000080000-4b2f0541' is that the oid of the vdi object and its tag. Signed-off-by: FUJITA Tomonori commit 3207452153fea237d0bb8523b2fedc27f5e143fe Author: FUJITA Tomonori Date: Mon Dec 21 17:04:15 2009 +0900 convert super object to directory This converts the super object to a directory (from a file). Signed-off-by: FUJITA Tomonori commit 979be4c0821882509323c59085ca316b6acada4e Author: FUJITA Tomonori Date: Mon Dec 21 17:04:14 2009 +0900 move check_epoch() out of store_queue_request Signed-off-by: FUJITA Tomonori commit bb6dea72bfc8d89bac91d1d55a8af9ca08d49391 Author: FUJITA Tomonori Date: Wed Dec 16 08:52:45 2009 +0900 remove all the autoconf stuff Just simple makefiles should be enough. Signed-off-by: FUJITA Tomonori commit 38960db453be92ba2b95b39bcda7c9184b1e48e4 Author: MORITA Kazutaka Date: Mon Dec 14 03:20:35 2009 +0900 removed unused codes sheep and dog codes are no longer used Signed-off-by: MORITA Kazutaka commit 93eae9eb99b6d389a640eb0f748fd3c50adac9e5 Author: MORITA Kazutaka Date: Wed Dec 2 03:35:23 2009 +0900 add scripts to use collie start-sheepdog is useful when you want to run some collies in one machine. For example, the following command starts eight collie processes in a local machine. $ ./script/start-sheepdog -n=8 When you want to stop these processes, run the following command. $ ./script/stop-sheepdog [0-7] Signed-off-by: MORITA Kazutaka commit cfee94f1748ee9544ac4220a4994d6d6fc400ba3 Author: MORITA Kazutaka Date: Wed Dec 2 03:35:22 2009 +0900 modify Makefile.in and configure.in to support collie To use collie, please install corosync. 
A Java dog daemon is no longer supported by this script. Signed-off-by: MORITA Kazutaka commit 959e8d5ab8e96e10ad76a878624b417d5911d0e8 Author: MORITA Kazutaka Date: Wed Dec 2 03:35:21 2009 +0900 merge dog port and sheep port We have only one sheepdog daemon now, so we don't need more than one port. Signed-off-by: MORITA Kazutaka commit 80eecd3789b4771232e73c73236fef97265eba8a Author: MORITA Kazutaka Date: Wed Dec 2 03:35:20 2009 +0900 collie: add disk I/O manager This is originally a part of sheep. Currently, only btrfs is supported as a local file system, but we think of removing this restriction. Data recovery will be also supported soon. Signed-off-by: MORITA Kazutaka commit 65c1d524e9d4cecbab2638dcc2516e5f8460646c Author: MORITA Kazutaka Date: Wed Dec 2 03:35:19 2009 +0900 collie: add cluster manager This is originally a part of dog (puppy). Cluster communication and VDI manipulation are supported. Signed-off-by: MORITA Kazutaka commit 1de4a13536ecff8eb40e7661b8d6694bff4a0ad5 Author: MORITA Kazutaka Date: Wed Dec 2 03:35:18 2009 +0900 collie: core codes of a sheepdog daemon This includes a main function, connections handling, and worker threads. Signed-off-by: MORITA Kazutaka commit 7bd54605341352e4b105686c56d8794928d8bd6e Author: MORITA Kazutaka Date: Tue Nov 17 03:03:03 2009 +0900 add sheepdog repository Signed-off-by: MORITA Kazutaka debian/source/0000755000000000000000000000000012243274447010477 5ustar debian/source/format0000644000000000000000000000001412243274472011703 0ustar 3.0 (quilt) debian/po/0000755000000000000000000000000012243274447007615 5ustar debian/po/zh_CN.po0000644000000000000000000000530612243274447011162 0ustar # Chinese translations for sheepdog package # sheepdog 软件包的简体中文翻译. # Copyright (C) 2013 THE sheepdog'S COPYRIGHT HOLDER # This file is distributed under the same license as the sheepdog package. # syq , 2013. # YunQiang Su , 2013. 
# msgid "" msgstr "" "Project-Id-Version: sheepdog\n" "Report-Msgid-Bugs-To: sheepdog@packages.debian.org\n" "POT-Creation-Date: 2013-02-08 11:06+0800\n" "PO-Revision-Date: 2013-02-08 12:10+0800\n" "Last-Translator: YunQiang Su \n" "Language-Team: Chinese (simplified) \n" "Language: zh_CN\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" "Plural-Forms: nplurals=1; plural=0;\n" "X-Generator: Gtranslator 2.91.5\n" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "Automatically start the sheepdog service?" msgstr "是否自动启动 sheepdog 服务?" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "" "Please choose whether the sheepdog service should start automatically when " "the system is booted." msgstr "选择系统启动时是否自动启动 sheepdog 服务。" #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "Arguments for the sheepdog daemon:" msgstr "sheepdog 进程的参数:" #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Please choose the command line arguments that should be passed to the " "sheepdog daemon. If no argument is given, the default behavior is to start " "on port 7000, using the corosync driver." msgstr "" "请选择传递给 sheepdog 守护进程的命令行参数。如果没有给定参数,默认行为是监听 " "7000 端口,使用的集群驱动为 corosync。" #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Available options include:\n" " -p, --port specify the TCP port to listen to\n" " -l, --loglevel specify the level of logging detail\n" " -d, --debug include debug messages in the log\n" " -D, --directio use direct I/O when accessing the object store\n" " -z, --zone specify the zone ID\n" " -c, --cluster specify the cluster driver\n" "More information can be found in the sheep(8) manual page."
msgstr "" "可用的选项包括:\n" " -p, --port 指定要监听的 TCP 端口\n" " -l, --loglevel 指定日志详细程度\n" " -d, --debug 在日志中包含调试信息\n" " -D, --directio 访问存储对象时,使用直接 I/O\n" " -z, --zone 指定 zone ID\n" " -c, --cluster 指定集群驱动\n" "更多信息请查阅 sheep(8) 手册页。" debian/po/templates.pot0000644000000000000000000000347312243274447012346 0ustar # SOME DESCRIPTIVE TITLE. # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER # This file is distributed under the same license as the PACKAGE package. # FIRST AUTHOR , YEAR. # #, fuzzy msgid "" msgstr "" "Project-Id-Version: sheepdog\n" "Report-Msgid-Bugs-To: sheepdog@packages.debian.org\n" "POT-Creation-Date: 2013-02-08 11:06+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" "Language: \n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=CHARSET\n" "Content-Transfer-Encoding: 8bit\n" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "Automatically start the sheepdog service?" msgstr "" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "" "Please choose whether the sheepdog service should start automatically when " "the system is booted." msgstr "" #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "Arguments for the sheepdog daemon:" msgstr "" #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Please choose the command line arguments that should be passed to the " "sheepdog daemon. If no argument is given, the default behavior is to start " "on port 7000, using the corosync driver." msgstr "" #. Type: string #.
Description #: ../sheepdog.templates:3001 msgid "" "Available options include:\n" " -p, --port specify the TCP port to listen to\n" " -l, --loglevel specify the level of logging detail\n" " -d, --debug include debug messages in the log\n" " -D, --directio use direct I/O when accessing the object store\n" " -z, --zone specify the zone ID\n" " -c, --cluster specify the cluster driver\n" "More information can be found in the sheep(8) manual page." msgstr "" debian/po/sv.po0000644000000000000000000000535712243274447010617 0ustar # SOME DESCRIPTIVE TITLE. # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER # This file is distributed under the same license as the PACKAGE package. # FIRST AUTHOR , YEAR. # msgid "" msgstr "" "Project-Id-Version: sheepdog\n" "Report-Msgid-Bugs-To: sheepdog@packages.debian.org\n" "POT-Creation-Date: 2012-12-19 08:23+0100\n" "PO-Revision-Date: 2013-01-03 17:16+0100\n" "Last-Translator: Martin Bagge / brother \n" "Language-Team: Swedish \n" "Language: \n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" "X-Poedit-Language: Swedish\n" "X-Poedit-Country: Sweden\n" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "Automatically start the sheepdog service?" msgstr "Ska sheepdog-tjänsten startas automatiskt?" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "" "Please choose whether the sheepdog service should start automatically when " "the system is booted." msgstr "" "Ange om tjänsten sheepdog ska startas automatiskt vid systemets uppstart." #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "Arguments for the sheepdog daemon:" msgstr "Argument till sheepdog-tjänsten:" #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Please choose the command line arguments that should be passed to the " "sheepdog daemon. If no argument is given, the default behavior is to start " "on port 7000, using the corosync driver." 
msgstr "" "Ange de kommandoradsargument som ska skickas till sheepdog-tjänsten. Om inga " "argument anges kommer standardbeteendet att användas, nämligen att " "drivrutinen corosync används och tjänsten använder port 7000." #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Available options include:\n" " -p, --port specify the TCP port to listen to\n" " -l, --loglevel specify the level of logging detail\n" " -d, --debug include debug messages in the log\n" " -D, --directio use direct I/O when accessing the object store\n" " -z, --zone specify the zone ID\n" " -c, --cluster specify the cluster driver\n" "More information can be found in the sheep(8) manual page." msgstr "" "Tillgängliga alternativ inkluderar:\n" " -p, --port ange vilken TCP port som ska lyssnas på\n" " -l, --loglevel ange detaljnivån för loggningen\n" " -d, --debug inkludera felsökningsmeddelanden i loggen\n" " -D, --directio använd direkt I/O vid kommunikation med " "objektlagret\n" " -z, --zone ange zone ID\n" " -c, --cluster ange klusterdrivrutin\n" "Mer information kan hittas i manualsidan sheep(8)." debian/po/ru.po0000644000000000000000000000642112243274447010606 0ustar # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER # This file is distributed under the same license as the sheepdog package. # # Yuri Kozlov , 2013. msgid "" msgstr "" "Project-Id-Version: sheepdog 0.5.4-2\n" "Report-Msgid-Bugs-To: sheepdog@packages.debian.org\n" "POT-Creation-Date: 2012-12-19 08:23+0100\n" "PO-Revision-Date: 2013-01-02 13:49+0400\n" "Last-Translator: Yuri Kozlov \n" "Language-Team: Russian \n" "Language: ru\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" "Plural-Forms: nplurals=3; plural=(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<" "=4 && (n%100<10 || n%100>=20) ? 1 : 2);\n" "X-Generator: Lokalize 1.4\n" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "Automatically start the sheepdog service?" 
msgstr "Запускать службу sheepdog автоматически?" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "" "Please choose whether the sheepdog service should start automatically when " "the system is booted." msgstr "" "Укажите, должна ли служба sheepdog запускаться автоматически при " "загрузке системы." #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "Arguments for the sheepdog daemon:" msgstr "Параметры службы sheepdog:" #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Please choose the command line arguments that should be passed to the " "sheepdog daemon. If no argument is given, the default behavior is to start " "on port 7000, using the corosync driver." msgstr "" "Введите параметры командной строки, которые нужно передать службе " "sheepdog. Если параметры не указывать, то по умолчанию используется порт " "7000 и драйвер corosync." #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Available options include:\n" " -p, --port specify the TCP port to listen to\n" " -l, --loglevel specify the level of logging detail\n" " -d, --debug include debug messages in the log\n" " -D, --directio use direct I/O when accessing the object store\n" " -z, --zone specify the zone ID\n" " -c, --cluster specify the cluster driver\n" "More information can be found in the sheep(8) manual page." msgstr "" "Возможные параметры:\n" " -p, --port прослушиваемый порт TCP\n" " -l, --loglevel степень детализации протоколирования\n" " -d, --debug добавлять сообщения отладки в журнал\n" " -D, --directio использовать непосредственный ввод-вывод\n" " при доступе к хранилищу объектов\n" " -z, --zone идентификатор зоны\n" " -c, --cluster драйвер кластера\n" "Подробная информация о параметрах приведена в справочной странице sheep(8)." 
debian/po/pt.po0000644000000000000000000000535112243274447010604 0ustar # sheepdog debconf portuguese messages # Copyright (C) 2012 THE PACKAGE'S COPYRIGHT HOLDER # This file is distributed under the same license as the sheepdog package. # Pedro Ribeiro , 2012 # msgid "" msgstr "" "Project-Id-Version: sheepdog\n" "Report-Msgid-Bugs-To: sheepdog@packages.debian.org\n" "POT-Creation-Date: 2013-02-08 11:06+0800\n" "PO-Revision-Date: 2012-12-23 23:55+0000\n" "Last-Translator: Pedro Ribeiro \n" "Language-Team: Portuguese \n" "Language: pt\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "Automatically start the sheepdog service?" msgstr "Iniciar o serviço sheepdog automaticamente?" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "" "Please choose whether the sheepdog service should start automatically when " "the system is booted." msgstr "" "Indique por favor se o serviço sheepdog deve ser iniciado automaticamente ao " "iniciar o sistema." #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "Arguments for the sheepdog daemon:" msgstr "Argumentos para o daemon sheepdog:" #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Please choose the command line arguments that should be passed to the " "sheepdog daemon. If no argument is given, the default behavior is to start " "on port 7000, using the corosync driver." msgstr "" "Indique por favor os argumentos da linha de comandos que devem ser passados " "para o daemon sheepdog. Se não forem indicados argumentos, o comportamento " "predefinido é iniciar no porto 7000 e usar o driver corosync." #. Type: string #. 
Description #: ../sheepdog.templates:3001 msgid "" "Available options include:\n" " -p, --port specify the TCP port to listen to\n" " -l, --loglevel specify the level of logging detail\n" " -d, --debug include debug messages in the log\n" " -D, --directio use direct I/O when accessing the object store\n" " -z, --zone specify the zone ID\n" " -c, --cluster specify the cluster driver\n" "More information can be found in the sheep(8) manual page." msgstr "" "As opções disponíveis incluem:\n" " -p, --port indique o porto TCP no qual escutar\n" " -l, --loglevel indique o nível de detalhe dos registos\n" " -d, --debug incluir mensagem de debug no registo\n" " -D, --directio usar I/O directo no acesso aos objectos\n" " -z, --zone indicar o ID de zona\n" " -c, --cluster indicar o driver do cluster\n" "Mais informação pode ser encontrada na página do manual sheep(8)." debian/po/pl.po0000644000000000000000000000561612243274447010600 0ustar # Translation of sheepdog debconf templates to Polish. # Copyright (C) 2012 # This file is distributed under the same license as the sheepdog package. # # Michał Kułach , 2012, 2013. msgid "" msgstr "" "Project-Id-Version: sheepdog\n" "Report-Msgid-Bugs-To: sheepdog@packages.debian.org\n" "POT-Creation-Date: 2012-12-19 08:23+0100\n" "PO-Revision-Date: 2013-01-02 23:59+0100\n" "Last-Translator: Michał Kułach \n" "Language-Team: Polish \n" "Language: pl\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" "Plural-Forms: nplurals=3; plural=(n==1 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 " "|| n%100>=20) ? 1 : 2);\n" "X-Generator: Lokalize 1.4\n" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "Automatically start the sheepdog service?" msgstr "Uruchamiać usługę sheepdog automatycznie?" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "" "Please choose whether the sheepdog service should start automatically when " "the system is booted." 
msgstr "" "Proszę wybrać, czy usługa sheepdog ma być uruchamiana automatycznie, w " "trakcie rozruchu systemu." #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "Arguments for the sheepdog daemon:" msgstr "Argumenty do demona sheepdog:" #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Please choose the command line arguments that should be passed to the " "sheepdog daemon. If no argument is given, the default behavior is to start " "on port 7000, using the corosync driver." msgstr "" "Proszę wprowadzić argumenty wiersza polecenia, które zostaną przekazane " "demonowi sheepdog. Jeśli nie poda się żadnych, to uruchomi się on na porcie " "7000, korzystając ze sterownika corosync." #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Available options include:\n" " -p, --port specify the TCP port to listen to\n" " -l, --loglevel specify the level of logging detail\n" " -d, --debug include debug messages in the log\n" " -D, --directio use direct I/O when accessing the object store\n" " -z, --zone specify the zone ID\n" " -c, --cluster specify the cluster driver\n" "More information can be found in the sheep(8) manual page." msgstr "" "Dostępne są między innymi:\n" " -p, --port określa port TCP do nasłuchu\n" " -l, --loglevel określa poziom detali dziennika\n" " -d, --debug dołącza wiadomości debugowania do dziennika\n" " -D, --directio używa bezpośredniego wejścia/wyjścia\n" " przy dostępie do przech. obiektu\n" " -z, --zone określa identyfikator strefy\n" " -c, --cluster określa sterownik klastra\n" "Więcej informacji zawiera strona podręcznika sheep(8)." debian/po/ja.po0000644000000000000000000000556212243274447010557 0ustar # SOME DESCRIPTIVE TITLE. # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER # This file is distributed under the same license as the PACKAGE package. # victory , 2012. 
# msgid "" msgstr "" "Project-Id-Version: sheepdog\n" "Report-Msgid-Bugs-To: sheepdog@packages.debian.org\n" "POT-Creation-Date: 2012-12-19 08:23+0100\n" "PO-Revision-Date: 2012-12-19 16:23+0900\n" "Last-Translator: victory \n" "Language-Team: Japanese \n" "Language: ja\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "Automatically start the sheepdog service?" msgstr "sheepdog サービスを自動的に開始しますか?" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "" "Please choose whether the sheepdog service should start automatically when " "the system is booted." msgstr "" "システムの起動時に sheepdog サービスを自動的に開始するかどうか決めてください。" #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "Arguments for the sheepdog daemon:" msgstr "sheepdog デーモンに渡す引数:" #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Please choose the command line arguments that should be passed to the " "sheepdog daemon. If no argument is given, the default behavior is to start " "on port 7000, using the corosync driver." msgstr "" "sheepdog デーモンに渡すコマンドライン引数を選択してください。引数を与えない場" "合のデフォルトの挙動は、ポート 7000 で corosync ドライバを利用して開始します。" #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Available options include:\n" " -p, --port specify the TCP port to listen to\n" " -l, --loglevel specify the level of logging detail\n" " -d, --debug include debug messages in the log\n" " -D, --directio use direct I/O when accessing the object store\n" " -z, --zone specify the zone ID\n" " -c, --cluster specify the cluster driver\n" "More information can be found in the sheep(8) manual page." 
msgstr "" "利用可能なオプションには以下のようなものがあります:\n" " -p, --port 待ち受ける TCP ポートを指定します\n" " -l, --loglevel どの程度詳細にログに記録するかを指定します\n" " -d, --debug ログにデバッグ用メッセージを記録します\n" " -D, --directio オブジェクトストアへのアクセスにダイレクト I/O を使います\n" " -z, --zone ゾーン ID を指定します\n" " -c, --cluster クラスタドライバを指定します\n" "sheep(8) マニュアルページにさらなる情報があります。" debian/po/it.po0000644000000000000000000000562412243274447010600 0ustar # Italian translation of sheepdog debconf messages. # Copyright (C) 2012, Debian Italian l10n team # This file is distributed under the same license as the sheepdog package. # Beatrice Torracca , 2012, 2013. msgid "" msgstr "" "Project-Id-Version: sheepdog\n" "Report-Msgid-Bugs-To: sheepdog@packages.debian.org\n" "POT-Creation-Date: 2012-12-19 08:23+0100\n" "PO-Revision-Date: 2013-01-01 11:13+0200\n" "Last-Translator: Beatrice Torracca \n" "Language-Team: Italian \n" "Language: it\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" "Plural-Forms: nplurals=2; plural=(n != 1);\n" "X-Generator: Virtaal 0.7.1\n" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "Automatically start the sheepdog service?" msgstr "Avviare il servizio sheepdog automaticamente?" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "" "Please choose whether the sheepdog service should start automatically when " "the system is booted." msgstr "" "Scegliere se il servizio sheepdog deve essere fatto partire automaticamente " "all'avvio del sistema." #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "Arguments for the sheepdog daemon:" msgstr "Argomenti per il demone sheepdog:" #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Please choose the command line arguments that should be passed to the " "sheepdog daemon. If no argument is given, the default behavior is to start " "on port 7000, using the corosync driver." 
msgstr "" "Scegliere gli argomenti per la riga di comando che devono essere passati al " "demone sheepdog. Se non viene specificato alcun argomento, il comportamento " "predefinito è di avviarlo sulla porta 7000 usando il driver corosync." #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Available options include:\n" " -p, --port specify the TCP port to listen to\n" " -l, --loglevel specify the level of logging detail\n" " -d, --debug include debug messages in the log\n" " -D, --directio use direct I/O when accessing the object store\n" " -z, --zone specify the zone ID\n" " -c, --cluster specify the cluster driver\n" "More information can be found in the sheep(8) manual page." msgstr "" "Le opzioni disponibili includono:\n" " -p, --port specifica la porta TCP su cui rimanere in ascolto\n" " -l, --loglevel specifica il livello di dettaglio del registro\n" " -d, --debug include i messaggi di debug nel registro\n" " -D, --directio usa l'I/O diretto quando accede all'archivio degli " "oggetti\n" " -z, --zone specifica l'ID di zona\n" " -c, --cluster specifica il driver per il cluster\n" "Ulteriori informazioni si possono trovare nella pagina di manuale sheep(8)." debian/po/gl.po0000644000000000000000000000551012243274447010560 0ustar # Galician translations for sheepdog package. # Copyright (C) 2012 THE sheepdog'S COPYRIGHT HOLDER # This file is distributed under the same license as the sheepdog package. # # Jorge Barreiro , 2012. msgid "" msgstr "" "Project-Id-Version: sheepdog\n" "Report-Msgid-Bugs-To: sheepdog@packages.debian.org\n" "POT-Creation-Date: 2012-12-19 08:23+0100\n" "PO-Revision-Date: 2012-12-30 19:01+0100\n" "Last-Translator: Jorge Barreiro \n" "Language-Team: Galician \n" "Language: gl\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" "X-Generator: Lokalize 1.0\n" "Plural-Forms: nplurals=2; plural=n != 1;\n" #. Type: boolean #. 
Description #: ../sheepdog.templates:2001 msgid "Automatically start the sheepdog service?" msgstr "Iniciar o servizo «sheepdog» automaticamente?" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "" "Please choose whether the sheepdog service should start automatically when " "the system is booted." msgstr "" "Escolla se o servizo «sheepdog» debe iniciarse automaticamente cando " "arranque o sistema." #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "Arguments for the sheepdog daemon:" msgstr "Parámetros para o servizo «sheepdog»:" #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Please choose the command line arguments that should be passed to the " "sheepdog daemon. If no argument is given, the default behavior is to start " "on port 7000, using the corosync driver." msgstr "" "Escolla os parámetros da liña de orde que se pasarán ao servizo «sheepdog»." " " "Se non indica ningún parámetro, o comportamento por omisión é iniciarse no " "porto 7000, usando o controlador «corosync»." #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Available options include:\n" " -p, --port specify the TCP port to listen to\n" " -l, --loglevel specify the level of logging detail\n" " -d, --debug include debug messages in the log\n" " -D, --directio use direct I/O when accessing the object store\n" " -z, --zone specify the zone ID\n" " -c, --cluster specify the cluster driver\n" "More information can be found in the sheep(8) manual page." msgstr "" "As opcións dispoñíbeis inclúen:\n" " -p, --port indica o porto TCP no que escoitar\n" " -l, --loglevel indica o nivel de detalle do rexistro\n" " -d, --debug incluír mensaxes de depuración no rexistro\n" " -D, --directio usar I/O directo ao acceder ao almacén de " "obxectos\n" " -z, --zone indica o ID de zona\n" " -c, --cluster indica o controlador do clúster\n" "Pode atopar máis información na páxina do manual de sheep(8)."
debian/po/fr.po0000644000000000000000000000574512243274447010577 0ustar # Translation of sheepdog debconf templates to french. # Copyright (C) 2012, Debian l10n French team # This file is distributed under the same license as the SHEEPDOG package. # Julien Patriarca , 2012. # msgid "" msgstr "" "Project-Id-Version: sheepdog\n" "Report-Msgid-Bugs-To: sheepdog@packages.debian.org\n" "POT-Creation-Date: 2012-12-19 08:23+0100\n" "PO-Revision-Date: 2012-12-19 14:03+0100\n" "Last-Translator: Julien Patriarca \n" "Language-Team: FRENCH \n" "Language: \n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "Automatically start the sheepdog service?" msgstr "Faut-il démarrer automatiquement le service sheepdog ?" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "" "Please choose whether the sheepdog service should start automatically when " "the system is booted." msgstr "" "Veuillez choisir si le service sheepdog doit démarrer automatiquement au " "lancement du système." #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "Arguments for the sheepdog daemon:" msgstr "Paramètres pour le démon sheepdog :" #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Please choose the command line arguments that should be passed to the " "sheepdog daemon. If no argument is given, the default behavior is to start " "on port 7000, using the corosync driver." msgstr "" "Veuillez sélectionner les paramètres de ligne de commande qui doivent être " "passés au démon sheepdog. Si aucun paramètre n'est donné, le comportement par " "défaut est de démarrer sur le port 7000, en utilisant le pilote corosync." #. Type: string #. 
Description #: ../sheepdog.templates:3001 msgid "" "Available options include:\n" " -p, --port specify the TCP port to listen to\n" " -l, --loglevel specify the level of logging detail\n" " -d, --debug include debug messages in the log\n" " -D, --directio use direct I/O when accessing the object store\n" " -z, --zone specify the zone ID\n" " -c, --cluster specify the cluster driver\n" "More information can be found in the sheep(8) manual page." msgstr "" "Les options disponibles sont :\n" " -p, --port indique le port TCP sur lequel écouter ;\n" " -l, --loglevel indique le niveau de détails de la journalisation ;\n" " -d, --debug inclut les messages de débogage dans les journaux\n" " du système ;\n" " -D, --directio utilise des entrées/sorties directes lors de l'accès\n" " à l'objet stockage ;\n" " -z, --zone indique l'identifiant de zone ;\n" " -c, --cluster indique le pilote de groupe (« cluster »).\n" "Vous pouvez trouver plus d'informations dans la page de manuel de sheep(8)." debian/po/es.po0000644000000000000000000000707412243274447010574 0ustar # sheepdog po-debconf translation to Spanish # Copyright (C) 2010 Software in the Public Interest # This file is distributed under the same license as the sheepdog package. 
# # Changes: # - Initial translation # Camaleón , 2012 # # - Updates # # # Traductores, si no conocen el formato PO, merece la pena leer la # documentación de gettext, especialmente las secciones dedicadas a este # formato, por ejemplo ejecutando: # info -n '(gettext)PO Files' # info -n '(gettext)Header Entry' # # Equipo de traducción al español, por favor lean antes de traducir # los siguientes documentos: # # - El proyecto de traducción de Debian al español # http://www.debian.org/intl/spanish/ # especialmente las notas y normas de traducción en # http://www.debian.org/intl/spanish/notas # # - La guía de traducción de po's de debconf: # /usr/share/doc/po-debconf/README-trans # o http://www.debian.org/intl/l10n/po-debconf/README-trans # msgid "" msgstr "" "Project-Id-Version: sheepdog 0.5.4-1\n" "Report-Msgid-Bugs-To: sheepdog@packages.debian.org\n" "POT-Creation-Date: 2013-02-08 11:06+0800\n" "PO-Revision-Date: 2012-12-28 19:20+0100\n" "Last-Translator: Camaleón \n" "Language-Team: Debian Spanish \n" "Language: es\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "Automatically start the sheepdog service?" msgstr "¿Desea que el servicio sheepdog se inicie automáticamente?" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "" "Please choose whether the sheepdog service should start automatically when " "the system is booted." msgstr "" "Elija si el servicio sheepdog se debe iniciar automáticamente al arrancar el " "sistema." #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "Arguments for the sheepdog daemon:" msgstr "Opciones para el demonio sheepdog:" #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Please choose the command line arguments that should be passed to the " "sheepdog daemon. 
If no argument is given, the default behavior is to start " "on port 7000, using the corosync driver." msgstr "" "Seleccione las opciones para el inicio en línea de órdenes del demonio " "sheepdog. Si no proporciona ninguna opción, se utilizará el comportamiento " "predeterminado (el servicio se iniciará en el puerto 7000 y utilizará el " "controlador corosync)." #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Available options include:\n" " -p, --port specify the TCP port to listen to\n" " -l, --loglevel specify the level of logging detail\n" " -d, --debug include debug messages in the log\n" " -D, --directio use direct I/O when accessing the object store\n" " -z, --zone specify the zone ID\n" " -c, --cluster specify the cluster driver\n" "More information can be found in the sheep(8) manual page." msgstr "" "Opciones disponibles:\n" " -p, --port especificar el puerto TCP de escucha\n" " -l, --loglevel especificar el nivel de detalle del registro\n" " -d, --debug incluir los mensajes de depuración en el registro\n" " -D, --directio utilizar E/S directa para acceder al almacén de " "objetos\n" " -z, --zone especificar el identificador (ID) de la zona\n" " -c, --cluster especificar el controlador del cluster\n" "Puede obtener más información en la página del manual de sheep(8)." debian/po/de.po0000644000000000000000000000561412243274447010553 0ustar # German translation of sheepdog. # Copyright (C) 2010 Guido Günther. # This file is distributed under the same license as the sheepdog package. # Translation by Chris Leick 2012. 
# msgid "" msgstr "" "Project-Id-Version: sheepdog 0.5.4-2\n" "Report-Msgid-Bugs-To: sheepdog@packages.debian.org\n" "POT-Creation-Date: 2012-12-19 08:23+0100\n" "PO-Revision-Date: 2012-12-22 15:49+0100\n" "Last-Translator: Chris Leick \n" "Language-Team: German \n" "Language: de\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" "Plural-Forms: nplurals=2; plural=n != 1;\n" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "Automatically start the sheepdog service?" msgstr "Sheepdog-Dienst automatisch starten?" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "" "Please choose whether the sheepdog service should start automatically when " "the system is booted." msgstr "" "Bitte wählen Sie, ob der Sheepdog-Dienst automatisch beim Hochfahren des " "Systems gestartet werden soll." #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "Arguments for the sheepdog daemon:" msgstr "Argumente für den Sheepdog-Daemon:" #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Please choose the command line arguments that should be passed to the " "sheepdog daemon. If no argument is given, the default behavior is to start " "on port 7000, using the corosync driver." msgstr "" "Bitte wählen Sie die Befehlszeilenargumente, die an den Sheepdog-Daemon " "übergeben werden sollen. Falls kein Argument angegeben wird, startet er " "standardmäßig auf Port 7000 und verwendet den Corosync-Treiber." #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Available options include:\n" " -p, --port specify the TCP port to listen to\n" " -l, --loglevel specify the level of logging detail\n" " -d, --debug include debug messages in the log\n" " -D, --directio use direct I/O when accessing the object store\n" " -z, --zone specify the zone ID\n" " -c, --cluster specify the cluster driver\n" "More information can be found in the sheep(8) manual page." 
msgstr "" "Die verfügbaren Optionen umfassen:\n" " -p, --port gibt den TCP-Port an, auf dem auf eine Verbindung\n" " gewartet wird\n" " -l, --loglevel gibt die Detailstufe der Protokollierung an\n" " -d, --debug lässt Debug-Meldungen in das Protokoll einfließen\n" " -D, --directio verwendet beim Zugriff auf den Objektspeicher\n" " direkte E/A\n" " -z, --zone gibt die Zonenkennung an\n" " -c, --cluster gibt den Cluster-Treiber an\n" "Weitere Informationen finden Sie in der Handbuchseite sheep(8)." debian/po/da.po0000644000000000000000000000521612243274447010545 0ustar # Danish translation sheepdog. # Copyright (C) 2012 sheepdog og nedenstående oversættere. # This file is distributed under the same license as the sheepdog package. # Joe Hansen , 2012. # msgid "" msgstr "" "Project-Id-Version: sheepdog\n" "Report-Msgid-Bugs-To: sheepdog@packages.debian.org\n" "POT-Creation-Date: 2012-12-19 08:23+0100\n" "PO-Revision-Date: 2013-01-03 17:30+01:00\n" "Last-Translator: Joe Hansen \n" "Language-Team: Danish \n" "Language: da\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "Automatically start the sheepdog service?" msgstr "Start automatisk tjenesten sheepdog?" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "" "Please choose whether the sheepdog service should start automatically when " "the system is booted." msgstr "" "Vælg venligst hvorvidt tjenesten sheepdog skal starte automatisk når " "systemet startes op." #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "Arguments for the sheepdog daemon:" msgstr "Parametre for dæmonen sheepdog:" #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Please choose the command line arguments that should be passed to the " "sheepdog daemon. If no argument is given, the default behavior is to start " "on port 7000, using the corosync driver." 
msgstr "" "Vælg venligst kommandolinjeparametrene, som skal sendes til dæmonen sheepdog. " "Hvis ingen parametre angives er standarden at starte på port 7000, med driveren " "corosync." #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Available options include:\n" " -p, --port specify the TCP port to listen to\n" " -l, --loglevel specify the level of logging detail\n" " -d, --debug include debug messages in the log\n" " -D, --directio use direct I/O when accessing the object store\n" " -z, --zone specify the zone ID\n" " -c, --cluster specify the cluster driver\n" "More information can be found in the sheep(8) manual page." msgstr "" "Tilgængelige tilvalg inkluderer:\n" " -p, --port angiv TCP-porten der skal lyttes på\n" " -l, --loglevel angiv loggens detaljeniveau\n" " -d, --debug inkluder fejlsøgningsbeskeder i loggen\n" " -D, --directio brug direkte I/O når objektlageret tilgås\n" " -z, --zone angiv zone-id\n" " -c, --cluster angiv klyngedriveren\n" "Yderligere information kan findes på manualsiden sheep(8)." debian/po/cs.po0000644000000000000000000000541712243274447010571 0ustar # Czech PO debconf template translation of sheepdog. # Copyright (C) 2012 Michal Simunek # This file is distributed under the same license as the sheepdog package. # Michal Simunek , 2012. # msgid "" msgstr "" "Project-Id-Version: sheepdog 0.5.4-2\n" "Report-Msgid-Bugs-To: sheepdog@packages.debian.org\n" "POT-Creation-Date: 2012-12-19 08:23+0100\n" "PO-Revision-Date: 2012-12-20 10:31+0100\n" "Last-Translator: Michal Simunek \n" "Language-Team: Czech \n" "Language: cs\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=utf-8\n" "Content-Transfer-Encoding: 8bit\n" #. Type: boolean #. Description #: ../sheepdog.templates:2001 msgid "Automatically start the sheepdog service?" msgstr "Spouštět službu sheepdog automaticky?" #. Type: boolean #. 
Description #: ../sheepdog.templates:2001 msgid "" "Please choose whether the sheepdog service should start automatically when " "the system is booted." msgstr "" "Zvolte si prosím, zda se má služba sheepdog automaticky spouštět při " "zavádění systému." #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "Arguments for the sheepdog daemon:" msgstr "Volby pro démona sheepdog:" #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Please choose the command line arguments that should be passed to the " "sheepdog daemon. If no argument is given, the default behavior is to start " "on port 7000, using the corosync driver." msgstr "" "Zadejte prosím volby pro příkazový řádek, které se mají předat " "démonu sheepdog. Nezadáte-li žádnou volbu, sheepdog se bude spouštět " "ve výchozím stavu na portu 7000 a bude používat ovladač corosync." #. Type: string #. Description #: ../sheepdog.templates:3001 msgid "" "Available options include:\n" " -p, --port specify the TCP port to listen to\n" " -l, --loglevel specify the level of logging detail\n" " -d, --debug include debug messages in the log\n" " -D, --directio use direct I/O when accessing the object store\n" " -z, --zone specify the zone ID\n" " -c, --cluster specify the cluster driver\n" "More information can be found in the sheep(8) manual page." msgstr "" "Možné volby zahrnují:\n" " -p, --port určuje TCP port, na kterém se bude naslouchat\n" " -l, --loglevel určuje úroveň zaznamenávaných informací\n" " -d, --debug do záznamu zahrne ladicí zprávy\n" " -D, --directio při přístupu k objektu store použije přímo I/O\n" " -z, --zone určuje ID zóny\n" " -c, --cluster určuje ovladač clusteru\n" "Více informací naleznete v manuálové stránce sheep(8)." 
debian/po/POTFILES.in0000644000000000000000000000005512243274472011370 0ustar [type: gettext/rfc822deb] sheepdog.templates debian/patches/0000755000000000000000000000000012243356461010623 5ustar debian/patches/define_EFD_SEMAPHORE_ifnone.diff0000644000000000000000000000174012243356461016330 0ustar Index: sheepdog/lib/work.c =================================================================== --- sheepdog.orig/lib/work.c 2013-11-21 18:14:22.111032550 +0800 +++ sheepdog/lib/work.c 2013-11-21 18:14:22.103032566 +0800 @@ -36,6 +36,10 @@ #define TID_MAX_DEFAULT 0x8000 /* default maximum tid for most systems */ +#ifndef EFD_SEMAPHORE +#define EFD_SEMAPHORE 00000001 +#endif + static size_t tid_max; static unsigned long *tid_map; static int resume_efd; Index: sheepdog/sheep/group.c =================================================================== --- sheepdog.orig/sheep/group.c 2013-11-21 11:08:42.000000000 +0800 +++ sheepdog/sheep/group.c 2013-11-21 18:15:00.766951308 +0800 @@ -343,7 +343,7 @@ nodes_len = rsp->data_length - sizeof(timestamp); memcpy((void *)nodes, buf, nodes_len); if (timestamp) - memcpy(timestamp, buf + nodes_len, sizeof(timestamp)); + memcpy(timestamp, buf + nodes_len, sizeof(time_t)); return nodes_len / sizeof(struct sd_node); } debian/patches/subdir-objects.diff0000644000000000000000000000047112243300330014356 0ustar --- a/configure.ac +++ b/configure.ac @@ -26,7 +26,7 @@ AC_INIT([sheepdog], m4_default(git_version, sheepdog_version), [sheepdog@lists.wpkg.org]) -AM_INIT_AUTOMAKE([-Wno-portability]) +AM_INIT_AUTOMAKE([-Wno-portability subdir-objects]) AC_CONFIG_SRCDIR([dog/dog.c]) AC_CONFIG_HEADER([include/config.h]) debian/patches/series0000644000000000000000000000006512243352066012036 0ustar define_EFD_SEMAPHORE_ifnone.diff subdir-objects.diff