blktap-2.0.90/0000755000000000000000000000000011664746062011571 5ustar rootrootblktap-2.0.90/lvm/0000755000000000000000000000000011664745551012371 5ustar rootrootblktap-2.0.90/lvm/lvm-util.h0000644000000000000000000000475611664745551014327 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef _LVM_UTIL_H_ #define _LVM_UTIL_H_ #include #define MAX_NAME_SIZE 256 #define LVM_SEG_TYPE_LINEAR 1 #define LVM_SEG_TYPE_UNKNOWN 2 struct lv_segment { uint8_t type; char device[MAX_NAME_SIZE]; uint64_t pe_start; uint64_t pe_size; }; struct lv { char name[MAX_NAME_SIZE]; uint64_t size; uint32_t segments; struct lv_segment first_segment; }; struct pv { char name[MAX_NAME_SIZE]; uint64_t start; }; struct vg { char name[MAX_NAME_SIZE]; uint64_t extent_size; int pv_cnt; struct pv *pvs; int lv_cnt; struct lv *lvs; }; int lvm_scan_vg(const char *vg_name, struct vg *vg); void lvm_free_vg(struct vg *vg); #endif blktap-2.0.90/lvm/lvm-util.c0000644000000000000000000001715511664745551014317 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include "lvm-util.h" #define EPRINTF(_f, _a...) \ do { \ syslog(LOG_INFO, "%s: " _f, __func__, ##_a); \ } while (0) #define _NAME "%255s" static char line[1024]; static inline int lvm_read_line(FILE *scan) { memset(line, 0, sizeof(line)); return (fscanf(scan, "%1023[^\n]", line) != 1); } static inline int lvm_next_line(FILE *scan) { return (fscanf(scan, "%1023[\n]", line) != 1); } static int lvm_copy_name(char *dst, const char *src, size_t size) { if (strnlen(src, size) == size) return -ENAMETOOLONG; strcpy(dst, src); return 0; } static int lvm_parse_pv(struct vg *vg, const char *name, int pvs, uint64_t start) { int i, err; struct pv *pv; pv = NULL; if (!vg->pvs) { vg->pvs = calloc(pvs, sizeof(struct pv)); if (!vg->pvs) return -ENOMEM; } for (i = 0; i < pvs; i++) { pv = vg->pvs + i; if (!pv->name[0]) break; if (!strcmp(pv->name, name)) return -EEXIST; } if (!pv) return -ENOENT; if (i == pvs) return -ENOMEM; err = lvm_copy_name(pv->name, name, sizeof(pv->name) - 1); if (err) return err; pv->start = start; return 0; } static int lvm_open_vg(const char *vgname, struct vg *vg) { FILE *scan; int i, err, pvs, lvs; char *cmd, pvname[256]; uint64_t size, pv_start; memset(vg, 0, sizeof(*vg)); err = asprintf(&cmd, "/usr/sbin/vgs %s --noheadings --nosuffix --units=b " "--options=vg_name,vg_extent_size,lv_count,pv_count," "pv_name,pe_start --unbuffered 2> 
/dev/null", vgname); if (err == -1) return -ENOMEM; errno = 0; scan = popen(cmd, "r"); if (!scan) { err = (errno ? -errno : ENOMEM); goto out; } for (;;) { if (lvm_read_line(scan)) break; err = -EINVAL; if (sscanf(line, _NAME" %"PRIu64" %d %d "_NAME" %"PRIu64, vg->name, &size, &lvs, &pvs, pvname, &pv_start) != 6) { EPRINTF("sscanf failed on '%s'\n", line); goto out; } if (strcmp(vg->name, vgname)) { EPRINTF("VG name '%s' != '%s'\n", vg->name, vgname); goto out; } err = lvm_parse_pv(vg, pvname, pvs, pv_start); if (err) goto out; if (lvm_next_line(scan)) break; } err = -EINVAL; if (strcmp(vg->name, vgname)) { EPRINTF("VG name '%s' != '%s'\n", vg->name, vgname); goto out; } for (i = 0; i < pvs; i++) if (!vg->pvs[i].name[0]) { EPRINTF("pvs %d name empty\n", i); goto out; } err = -ENOMEM; vg->lvs = calloc(lvs, sizeof(struct lv)); if (!vg->lvs) goto out; err = 0; vg->lv_cnt = lvs; vg->pv_cnt = pvs; vg->extent_size = size; out: if (scan) pclose(scan); if (err) lvm_free_vg(vg); free(cmd); return err; } static int lvm_parse_lv_devices(struct vg *vg, struct lv_segment *seg, char *devices) { int i; uint64_t start, pe_start; for (i = 0; i < strlen(devices); i++) if (strchr(",()", devices[i])) devices[i] = ' '; if (sscanf(devices, _NAME" %"PRIu64, seg->device, &start) != 2) { EPRINTF("sscanf failed on '%s'\n", devices); return -EINVAL; } pe_start = -1; for (i = 0; i < vg->pv_cnt; i++) if (!strcmp(vg->pvs[i].name, seg->device)) { pe_start = vg->pvs[i].start; break; } if (pe_start == -1) { EPRINTF("invalid pe_start value\n"); return -EINVAL; } seg->pe_start = (start * vg->extent_size) + pe_start; return 0; } static int lvm_scan_lvs(struct vg *vg) { char *cmd; FILE *scan; int i, err; err = asprintf(&cmd, "/usr/sbin/lvs %s --noheadings --nosuffix --units=b " "--options=lv_name,lv_size,segtype,seg_count,seg_start," "seg_size,devices --unbuffered 2> /dev/null", vg->name); if (err == -1) return -ENOMEM; errno = 0; scan = popen(cmd, "r"); if (!scan) { err = (errno ? 
-errno : -ENOMEM); goto out; } for (i = 0;;) { int segs; struct lv *lv; struct lv_segment seg; unsigned long long size, seg_start; char type[32], name[256], devices[1024]; if (i >= vg->lv_cnt) break; if (lvm_read_line(scan)) { vg->lv_cnt = i; break; } err = -EINVAL; lv = vg->lvs + i; if (sscanf(line, _NAME" %llu %31s %u %llu %"PRIu64" %1023s", name, &size, type, &segs, &seg_start, &seg.pe_size, devices) != 7) { EPRINTF("sscanf failed on '%s'\n", line); goto out; } if (seg_start) goto next; if (!strcmp(type, "linear")) seg.type = LVM_SEG_TYPE_LINEAR; else seg.type = LVM_SEG_TYPE_UNKNOWN; if (lvm_parse_lv_devices(vg, &seg, devices)) goto out; i++; lv->size = size; lv->segments = segs; lv->first_segment = seg; err = lvm_copy_name(lv->name, name, sizeof(lv->name) - 1); if (err) goto out; err = -EINVAL; next: if (lvm_next_line(scan)) { if (err) EPRINTF("fscanf failed\n"); goto out; } } err = 0; out: if (scan) pclose(scan); free(cmd); return err; } void lvm_free_vg(struct vg *vg) { free(vg->lvs); free(vg->pvs); memset(vg, 0, sizeof(*vg)); } int lvm_scan_vg(const char *vg_name, struct vg *vg) { int err; memset(vg, 0, sizeof(*vg)); err = lvm_open_vg(vg_name, vg); if (err) return err; err = lvm_scan_lvs(vg); if (err) { lvm_free_vg(vg); return err; } return 0; } #ifdef LVM_UTIL static int usage(void) { printf("usage: lvm-util \n"); exit(EINVAL); } int main(int argc, char **argv) { int i, err; struct vg vg; struct pv *pv; struct lv *lv; struct lv_segment *seg; if (argc != 2) usage(); err = lvm_scan_vg(argv[1], &vg); if (err) { printf("scan failed: %d\n", err); return (err >= 0 ? 
err : -err); } printf("vg %s: extent_size: %"PRIu64", pvs: %d, lvs: %d\n", vg.name, vg.extent_size, vg.pv_cnt, vg.lv_cnt); for (i = 0; i < vg.pv_cnt; i++) { pv = vg.pvs + i; printf("pv %s: start %"PRIu64"\n", pv->name, pv->start); } for (i = 0; i < vg.lv_cnt; i++) { lv = vg.lvs + i; seg = &lv->first_segment; printf("lv %s: size: %"PRIu64", segments: %u, type: %u, " "dev: %s, pe_start: %"PRIu64", pe_size: %"PRIu64"\n", lv->name, lv->size, lv->segments, seg->type, seg->device, seg->pe_start, seg->pe_size); } lvm_free_vg(&vg); return 0; } #endif blktap-2.0.90/lvm/Makefile.am0000644000000000000000000000030611664745551014424 0ustar rootroot AM_CFLAGS = -Wall AM_CFLAGS += -Werror sbin_PROGRAMS = lvm-util lvm_util_SOURCES = lvm-util.c lvm_util_SOURCES += lvm-util.h lvm_util_CPPFLAGS = -D_GNU_SOURCE lvm_util_CPPFLAGS += -DLVM_UTIL blktap-2.0.90/autogen.sh0000755000000000000000000013443211664745551013603 0ustar rootroot#!/bin/sh # a u t o g e n . s h # # Copyright (c) 2005-2009 United States Government as represented by # the U.S. Army Research Laboratory. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above # copyright notice, this list of conditions and the following # disclaimer in the documentation and/or other materials provided # with the distribution. # # 3. The name of the author may not be used to endorse or promote # products derived from this software without specific prior written # permission. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS # OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. 
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
# GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
###
#
# Script for automatically preparing the sources for compilation by
# performing the myriad of necessary steps. The script attempts to
# detect proper version support, and outputs warnings about particular
# systems that have autotool peculiarities.
#
# Basically, if everything is set up and installed correctly, the
# script will validate that minimum versions of the GNU Build System
# tools are installed, account for several common configuration
# issues, and then simply run autoreconf for you.
#
# If autoreconf fails, which can happen for many valid configurations,
# this script proceeds to run manual preparation steps effectively
# providing a POSIX shell script (mostly complete) reimplementation of
# autoreconf.
#
# The AUTORECONF, AUTOCONF, AUTOMAKE, LIBTOOLIZE, ACLOCAL, AUTOHEADER
# environment variables and corresponding _OPTIONS variables (e.g.
# AUTORECONF_OPTIONS) may be used to override the default automatic
# detection behaviors. Similarly the _VERSION variables will override
# the minimum required version numbers.
#
# Examples:
#
# To obtain help on usage:
# ./autogen.sh --help
#
# To obtain verbose output:
# ./autogen.sh --verbose
#
# To skip autoreconf and prepare manually:
# AUTORECONF=false ./autogen.sh
#
# To verbosely try running with an older (unsupported) autoconf:
# AUTOCONF_VERSION=2.50 ./autogen.sh --verbose
#
# Author:
# Christopher Sean Morrison
#
# Patches:
# Sebastian Pipping
#
######################################################################

# set to minimum acceptable version of autoconf
if [ "x$AUTOCONF_VERSION" = "x" ] ; then
    AUTOCONF_VERSION=2.52
fi
# set to minimum acceptable version of automake
if [ "x$AUTOMAKE_VERSION" = "x" ] ; then
    AUTOMAKE_VERSION=1.6.0
fi
# set to minimum acceptable version of libtool
if [ "x$LIBTOOL_VERSION" = "x" ] ; then
    LIBTOOL_VERSION=1.4.2
fi


##################
# ident function #
##################
# Print a short banner identifying this script: author, license,
# copyright year and a version derived from the embedded Id string.
ident ( ) {
    # extract copyright from header
    __copyright="`grep Copyright $AUTOGEN_SH | head -${HEAD_N}1 | awk '{print $4}'`"
    if [ "x$__copyright" = "x" ] ; then
        __copyright="`date +%Y`"
    fi

    # extract version from CVS Id string
    __id="$Id: autogen.sh 33925 2009-03-01 23:27:06Z brlcad $"
    __version="`echo $__id | sed 's/.*\([0-9][0-9][0-9][0-9]\)[-\/]\([0-9][0-9]\)[-\/]\([0-9][0-9]\).*/\1\2\3/'`"
    if [ "x$__version" = "x" ] ; then
        __version=""
    fi

    echo "autogen.sh build preparation script by Christopher Sean Morrison"
    echo " + config.guess download patch by Sebastian Pipping (2008-12-03)"
    echo "revised 3-clause BSD-style license, copyright (c) $__copyright"
    echo "script version $__version, ISO/IEC 9945 POSIX shell script"
}


##################
# USAGE FUNCTION #
##################
# Print the command-line synopsis, a description and the ident banner.
# Always returns 0.
usage ( ) {
    echo "Usage: $AUTOGEN_SH [-h|--help] [-v|--verbose] [-q|--quiet] [-d|--download] [--version]"
    echo " --help Help on $NAME_OF_AUTOGEN usage"
    echo " --verbose Verbose progress output"
    echo " --quiet Quiet suppressed progress output"
    echo " --download Download the latest config.guess from gnulib"
    echo " --version Only perform GNU Build System version checks"
    echo
    echo "Description: This script will validate that minimum versions of the"
    echo "GNU Build System tools are installed and then run autoreconf for you."
    echo "Should autoreconf fail, manual preparation steps will be run"
    echo "potentially accounting for several common preparation issues. The"
    echo "AUTORECONF, AUTOCONF, AUTOMAKE, LIBTOOLIZE, ACLOCAL, AUTOHEADER,"
    echo "PROJECT, & CONFIGURE environment variables and corresponding _OPTIONS"
    echo "variables (e.g. AUTORECONF_OPTIONS) may be used to override the"
    echo "default automatic detection behavior."
    echo

    ident

    return 0
}


##########################
# VERSION_ERROR FUNCTION #
##########################
# Report that tool $2 is missing or older than the required version $1.
# Exits with status 1 if either argument is missing.
version_error ( ) {
    if [ "x$1" = "x" ] ; then
        echo "INTERNAL ERROR: version_error was not provided a version"
        exit 1
    fi
    if [ "x$2" = "x" ] ; then
        echo "INTERNAL ERROR: version_error was not provided an application name"
        exit 1
    fi
    $ECHO
    $ECHO "ERROR: To prepare the ${PROJECT} build system from scratch,"
    $ECHO " at least version $1 of $2 must be installed."
    $ECHO
    $ECHO "$NAME_OF_AUTOGEN does not need to be run on the same machine that will"
    $ECHO "run configure or make. Either the GNU Autotools will need to be installed"
    $ECHO "or upgraded on this system, or $NAME_OF_AUTOGEN must be run on the source"
    $ECHO "code on another system and then transferred to here. -- Cheers!"
    $ECHO
}

##########################
# VERSION_CHECK FUNCTION #
##########################
# Compare two dotted version strings.
#   $1  minimum acceptable version
#   $2  version that was found
# Returns 0 when $2 >= $1 (comparing major, minor, patch numerically),
# 1 otherwise.  Missing components count as 0.  Exits with status 1 if
# either argument is missing.
version_check ( ) {
    if [ "x$1" = "x" ] ; then
        echo "INTERNAL ERROR: version_check was not provided a minimum version"
        exit 1
    fi
    _min="$1"
    if [ "x$2" = "x" ] ; then
        echo "INTERNAL ERROR: version check was not provided a comparison version"
        exit 1
    fi
    _cur="$2"

    # needed to handle versions like 1.10 and 1.4-p6
    _min="`echo ${_min}. | sed 's/[^0-9]/./g' | sed 's/\.\././g'`"
    _cur="`echo ${_cur}. | sed 's/[^0-9]/./g' | sed 's/\.\././g'`"

    _min_major="`echo $_min | cut -d. -f1`"
    _min_minor="`echo $_min | cut -d. -f2`"
    _min_patch="`echo $_min | cut -d. -f3`"

    _cur_major="`echo $_cur | cut -d. -f1`"
    _cur_minor="`echo $_cur | cut -d. -f2`"
    _cur_patch="`echo $_cur | cut -d. -f3`"

    if [ "x$_min_major" = "x" ] ; then
        _min_major=0
    fi
    if [ "x$_min_minor" = "x" ] ; then
        _min_minor=0
    fi
    if [ "x$_min_patch" = "x" ] ; then
        _min_patch=0
    fi
    # default the major number only when the major number itself is
    # missing; testing the minor here would turn a version such as "2"
    # into 0.0.0
    if [ "x$_cur_major" = "x" ] ; then
        _cur_major=0
    fi
    if [ "x$_cur_minor" = "x" ] ; then
        _cur_minor=0
    fi
    if [ "x$_cur_patch" = "x" ] ; then
        _cur_patch=0
    fi

    $VERBOSE_ECHO "Checking if ${_cur_major}.${_cur_minor}.${_cur_patch} is greater than ${_min_major}.${_min_minor}.${_min_patch}"

    if [ $_min_major -lt $_cur_major ] ; then
        return 0
    elif [ $_min_major -eq $_cur_major ] ; then
        if [ $_min_minor -lt $_cur_minor ] ; then
            return 0
        elif [ $_min_minor -eq $_cur_minor ] ; then
            if [ $_min_patch -lt $_cur_patch ] ; then
                return 0
            elif [ $_min_patch -eq $_cur_patch ] ; then
                return 0
            fi
        fi
    fi
    return 1
}


######################################
# LOCATE_CONFIGURE_TEMPLATE FUNCTION #
######################################
# Print the path of the first configure.ac / configure.in found in the
# current directory or next to this script; print nothing if none exists.
locate_configure_template ( ) {
    _pwd="`pwd`"
    if test -f "./configure.ac" ; then
        echo "./configure.ac"
    elif test -f "./configure.in" ; then
        echo "./configure.in"
    elif test -f "$_pwd/configure.ac" ; then
        echo "$_pwd/configure.ac"
    elif test -f "$_pwd/configure.in" ; then
        echo "$_pwd/configure.in"
    elif test -f "$PATH_TO_AUTOGEN/configure.ac" ; then
        echo "$PATH_TO_AUTOGEN/configure.ac"
    elif test -f "$PATH_TO_AUTOGEN/configure.in" ; then
        echo "$PATH_TO_AUTOGEN/configure.in"
    fi
}


##################
# argument check #
##################
ARGS="$*"
PATH_TO_AUTOGEN="`dirname $0`"
NAME_OF_AUTOGEN="`basename $0`"
AUTOGEN_SH="$PATH_TO_AUTOGEN/$NAME_OF_AUTOGEN"

LIBTOOL_M4="${PATH_TO_AUTOGEN}/misc/libtool.m4"

if [ "x$HELP" = "x" ] ; then
    HELP=no
fi
if [ "x$QUIET" = "x" ] ; then
    QUIET=no
fi
if [ "x$VERBOSE" = "x" ] ; then
    VERBOSE=no
fi
if [ "x$VERSION_ONLY" = "x" ] ; then
    VERSION_ONLY=no
fi
if [ "x$DOWNLOAD" = "x" ] ; then
    DOWNLOAD=no
fi
if [ "x$AUTORECONF_OPTIONS" = "x" ] ; then
    AUTORECONF_OPTIONS="-i -f"
fi
# Remaining per-tool option defaults; each may be overridden from the
# environment.
if [ "x$AUTOCONF_OPTIONS" = "x" ] ; then
    AUTOCONF_OPTIONS="-f"
fi
if [ "x$AUTOMAKE_OPTIONS" = "x" ] ; then
    AUTOMAKE_OPTIONS="-a -c -f"
fi
ALT_AUTOMAKE_OPTIONS="-a -c"
if [ "x$LIBTOOLIZE_OPTIONS" = "x" ] ; then
    LIBTOOLIZE_OPTIONS="--automake -c -f"
fi
ALT_LIBTOOLIZE_OPTIONS="--automake --copy --force"
if [ "x$ACLOCAL_OPTIONS" = "x" ] ; then
    ACLOCAL_OPTIONS=""
fi
if [ "x$AUTOHEADER_OPTIONS" = "x" ] ; then
    AUTOHEADER_OPTIONS=""
fi
if [ "x$CONFIG_GUESS_URL" = "x" ] ; then
    CONFIG_GUESS_URL="http://git.savannah.gnu.org/gitweb/?p=gnulib.git;a=blob_plain;f=build-aux/config.guess;hb=HEAD"
fi
# Translate command-line flags into the HELP/QUIET/VERBOSE/DOWNLOAD/
# VERSION_ONLY switches; any unrecognized argument prints usage and exits 1.
for arg in $ARGS ; do
    case "x$arg" in
        x--help) HELP=yes ;;
        x-[hH]) HELP=yes ;;
        x--quiet) QUIET=yes ;;
        x-[qQ]) QUIET=yes ;;
        x--verbose) VERBOSE=yes ;;
        x-[dD]) DOWNLOAD=yes ;;
        x--download) DOWNLOAD=yes ;;
        x-[vV]) VERBOSE=yes ;;
        x--version) VERSION_ONLY=yes ;;
        *)
            echo "Unknown option: $arg"
            echo
            usage
            exit 1
            ;;
    esac
done


#####################
# environment check #
#####################

# sanity check before recursions potentially begin
if [ ! -f "$AUTOGEN_SH" ] ; then
    echo "INTERNAL ERROR: $AUTOGEN_SH does not exist"
    if [ ! "x$0" = "x$AUTOGEN_SH" ] ; then
        echo "INTERNAL ERROR: dirname/basename inconsistency: $0 != $AUTOGEN_SH"
    fi
    exit 1
fi

# force locale setting to C so things like date output as expected
# NOTE(review): LC_ALL is assigned but not exported here, so child
# processes only see it if it was already exported -- confirm intent.
LC_ALL=C

# commands that this script expects
for __cmd in echo head tail pwd ; do
    echo "test" | $__cmd > /dev/null 2>&1
    if [ $? != 0 ] ; then
        echo "INTERNAL ERROR: '${__cmd}' command is required"
        exit 2
    fi
done
echo "test" | grep "test" > /dev/null 2>&1
if test ! x$? = x0 ; then
    echo "INTERNAL ERROR: grep command is required"
    exit 1
fi
echo "test" | sed "s/test/test/" > /dev/null 2>&1
if test ! x$? = x0 ; then
    echo "INTERNAL ERROR: sed command is required"
    exit 1
fi


# determine the behavior of echo
# NOTE(review): ECHO_C and ECHO_T in the first arm both hold a single
# blank in this copy; the whitespace may have been collapsed from a
# literal newline / tab -- confirm against the pristine script.
case `echo "testing\c"; echo 1,2,3`,`echo -n testing; echo 1,2,3` in
    *c*,-n*) ECHO_N= ECHO_C=' ' ECHO_T=' ' ;;
    *c*,* ) ECHO_N=-n ECHO_C= ECHO_T= ;;
    *) ECHO_N= ECHO_C='\c' ECHO_T= ;;
esac

# determine the behavior of head
# (HEAD_N is spliced into "head -${HEAD_N}1", giving "-n 1" or "-1")
case "x`echo 'head' | head -n 1 2>&1`" in
    *xhead*) HEAD_N="n " ;;
    *) HEAD_N="" ;;
esac

# determine the behavior of tail
# (TAIL_N is spliced into "tail -${TAIL_N}1" the same way)
case "x`echo 'tail' | tail -n 1 2>&1`" in
    *xtail*) TAIL_N="n " ;;
    *) TAIL_N="" ;;
esac

# ECHO and VERBOSE_ECHO are either "echo" or the no-op ":" depending on
# the quiet/verbose switches.
VERBOSE_ECHO=:
ECHO=:
if [ "x$QUIET" = "xyes" ] ; then
    if [ "x$VERBOSE" = "xyes" ] ; then
        echo "Verbose output quelled by quiet option. Further output disabled."
    fi
else
    ECHO=echo
    if [ "x$VERBOSE" = "xyes" ] ; then
        echo "Verbose output enabled"
        VERBOSE_ECHO=echo
    fi
fi


# allow a recursive run to disable further recursions
if [ "x$RUN_RECURSIVE" = "x" ] ; then
    RUN_RECURSIVE=yes
fi


################################################
# check for help arg and bypass version checks #
################################################
# any argument containing "help" (case-insensitive) also requests help
if [ "x`echo $ARGS | sed 's/.*[hH][eE][lL][pP].*/help/'`" = "xhelp" ] ; then
    HELP=yes
fi
if [ "x$HELP" = "xyes" ] ; then
    usage
    $ECHO "---"
    $ECHO "Help was requested. No preparation or configuration will be performed."
    exit 0
fi


#######################
# set up signal traps #
#######################
# Reset the handlers for signals 1, 2, 13 and 15 to their defaults.
untrap_abnormal ( ) {
    for sig in 1 2 13 15; do
        trap - $sig
    done
}

# do this cleanup whenever we exit.
trap ' # start from the root if test -d "$START_PATH" ; then cd "$START_PATH" fi # restore/delete backup files if test "x$PFC_INIT" = "x1" ; then recursive_restore fi ' 0 # trap SIGHUP (1), SIGINT (2), SIGPIPE (13), SIGTERM (15) for sig in 1 2 13 15; do trap ' $ECHO "" $ECHO "Aborting $NAME_OF_AUTOGEN: caught signal '$sig'" # start from the root if test -d "$START_PATH" ; then cd "$START_PATH" fi # clean up on abnormal exit $VERBOSE_ECHO "rm -rf autom4te.cache" rm -rf autom4te.cache if test -f "acinclude.m4.$$.backup" ; then $VERBOSE_ECHO "cat acinclude.m4.$$.backup > acinclude.m4" chmod u+w acinclude.m4 cat acinclude.m4.$$.backup > acinclude.m4 $VERBOSE_ECHO "rm -f acinclude.m4.$$.backup" rm -f acinclude.m4.$$.backup fi { (exit 1); exit 1; } ' $sig done ############################# # look for a configure file # ############################# if [ "x$CONFIGURE" = "x" ] ; then CONFIGURE="`locate_configure_template`" if [ ! "x$CONFIGURE" = "x" ] ; then $VERBOSE_ECHO "Found a configure template: $CONFIGURE" fi else $ECHO "Using CONFIGURE environment variable override: $CONFIGURE" fi if [ "x$CONFIGURE" = "x" ] ; then if [ "x$VERSION_ONLY" = "xyes" ] ; then CONFIGURE=/dev/null else $ECHO $ECHO "A configure.ac or configure.in file could not be located implying" $ECHO "that the GNU Build System is at least not used in this directory. In" $ECHO "any case, there is nothing to do here without one of those files." $ECHO $ECHO "ERROR: No configure.in or configure.ac file found in `pwd`" exit 1 fi fi #################### # get project name # #################### if [ "x$PROJECT" = "x" ] ; then PROJECT="`grep AC_INIT $CONFIGURE | grep -v '.*#.*AC_INIT' | tail -${TAIL_N}1 | sed 's/^[ ]*AC_INIT(\([^,)]*\).*/\1/' | sed 's/.*\[\(.*\)\].*/\1/'`" if [ "x$PROJECT" = "xAC_INIT" ] ; then # projects might be using the older/deprecated arg-less AC_INIT .. 
look for AM_INIT_AUTOMAKE instead PROJECT="`grep AM_INIT_AUTOMAKE $CONFIGURE | grep -v '.*#.*AM_INIT_AUTOMAKE' | tail -${TAIL_N}1 | sed 's/^[ ]*AM_INIT_AUTOMAKE(\([^,)]*\).*/\1/' | sed 's/.*\[\(.*\)\].*/\1/'`" fi if [ "x$PROJECT" = "xAM_INIT_AUTOMAKE" ] ; then PROJECT="project" fi if [ "x$PROJECT" = "x" ] ; then PROJECT="project" fi else $ECHO "Using PROJECT environment variable override: $PROJECT" fi $ECHO "Preparing the $PROJECT build system...please wait" $ECHO ######################## # check for autoreconf # ######################## HAVE_AUTORECONF=no if [ "x$AUTORECONF" = "x" ] ; then for AUTORECONF in autoreconf ; do $VERBOSE_ECHO "Checking autoreconf version: $AUTORECONF --version" $AUTORECONF --version > /dev/null 2>&1 if [ $? = 0 ] ; then HAVE_AUTORECONF=yes break fi done else HAVE_AUTORECONF=yes $ECHO "Using AUTORECONF environment variable override: $AUTORECONF" fi ########################## # autoconf version check # ########################## _acfound=no if [ "x$AUTOCONF" = "x" ] ; then for AUTOCONF in autoconf ; do $VERBOSE_ECHO "Checking autoconf version: $AUTOCONF --version" $AUTOCONF --version > /dev/null 2>&1 if [ $? = 0 ] ; then _acfound=yes break fi done else _acfound=yes $ECHO "Using AUTOCONF environment variable override: $AUTOCONF" fi _report_error=no if [ ! "x$_acfound" = "xyes" ] ; then $ECHO "ERROR: Unable to locate GNU Autoconf." _report_error=yes else _version="`$AUTOCONF --version | head -${HEAD_N}1 | sed 's/[^0-9]*\([0-9\.][0-9\.]*\)/\1/'`" if [ "x$_version" = "x" ] ; then _version="0.0.0" fi $ECHO "Found GNU Autoconf version $_version" version_check "$AUTOCONF_VERSION" "$_version" if [ $? 
-ne 0 ] ; then _report_error=yes fi fi if [ "x$_report_error" = "xyes" ] ; then version_error "$AUTOCONF_VERSION" "GNU Autoconf" exit 1 fi ########################## # automake version check # ########################## _amfound=no if [ "x$AUTOMAKE" = "x" ] ; then for AUTOMAKE in automake ; do $VERBOSE_ECHO "Checking automake version: $AUTOMAKE --version" $AUTOMAKE --version > /dev/null 2>&1 if [ $? = 0 ] ; then _amfound=yes break fi done else _amfound=yes $ECHO "Using AUTOMAKE environment variable override: $AUTOMAKE" fi _report_error=no if [ ! "x$_amfound" = "xyes" ] ; then $ECHO $ECHO "ERROR: Unable to locate GNU Automake." _report_error=yes else _version="`$AUTOMAKE --version | head -${HEAD_N}1 | sed 's/[^0-9]*\([0-9\.][0-9\.]*\)/\1/'`" if [ "x$_version" = "x" ] ; then _version="0.0.0" fi $ECHO "Found GNU Automake version $_version" version_check "$AUTOMAKE_VERSION" "$_version" if [ $? -ne 0 ] ; then _report_error=yes fi fi if [ "x$_report_error" = "xyes" ] ; then version_error "$AUTOMAKE_VERSION" "GNU Automake" exit 1 fi ######################## # check for libtoolize # ######################## HAVE_LIBTOOLIZE=yes HAVE_ALT_LIBTOOLIZE=no _ltfound=no if [ "x$LIBTOOLIZE" = "x" ] ; then LIBTOOLIZE=libtoolize $VERBOSE_ECHO "Checking libtoolize version: $LIBTOOLIZE --version" $LIBTOOLIZE --version > /dev/null 2>&1 if [ ! $? = 0 ] ; then HAVE_LIBTOOLIZE=no $ECHO if [ "x$HAVE_AUTORECONF" = "xno" ] ; then $ECHO "Warning: libtoolize does not appear to be available." else $ECHO "Warning: libtoolize does not appear to be available. This means that" $ECHO "the automatic build preparation via autoreconf will probably not work." $ECHO "Preparing the build by running each step individually, however, should" $ECHO "work and will be done automatically for you if autoreconf fails." 
fi # look for some alternates for tool in glibtoolize libtoolize15 libtoolize14 libtoolize13 ; do $VERBOSE_ECHO "Checking libtoolize alternate: $tool --version" _glibtoolize="`$tool --version > /dev/null 2>&1`" if [ $? = 0 ] ; then $VERBOSE_ECHO "Found $tool --version" _glti="`which $tool`" if [ "x$_glti" = "x" ] ; then $VERBOSE_ECHO "Cannot find $tool with which" continue; fi if test ! -f "$_glti" ; then $VERBOSE_ECHO "Cannot use $tool, $_glti is not a file" continue; fi _gltidir="`dirname $_glti`" if [ "x$_gltidir" = "x" ] ; then $VERBOSE_ECHO "Cannot find $tool path with dirname of $_glti" continue; fi if test ! -d "$_gltidir" ; then $VERBOSE_ECHO "Cannot use $tool, $_gltidir is not a directory" continue; fi HAVE_ALT_LIBTOOLIZE=yes LIBTOOLIZE="$tool" $ECHO $ECHO "Fortunately, $tool was found which means that your system may simply" $ECHO "have a non-standard or incomplete GNU Autotools install. If you have" $ECHO "sufficient system access, it may be possible to quell this warning by" $ECHO "running:" $ECHO sudo -V > /dev/null 2>&1 if [ $? = 0 ] ; then $ECHO " sudo ln -s $_glti $_gltidir/libtoolize" $ECHO else $ECHO " ln -s $_glti $_gltidir/libtoolize" $ECHO $ECHO "Run that as root or with proper permissions to the $_gltidir directory" $ECHO fi _ltfound=yes break fi done else _ltfound=yes fi else _ltfound=yes $ECHO "Using LIBTOOLIZE environment variable override: $LIBTOOLIZE" fi ############################ # libtoolize version check # ############################ _report_error=no if [ ! "x$_ltfound" = "xyes" ] ; then $ECHO $ECHO "ERROR: Unable to locate GNU Libtool." _report_error=yes else _version="`$LIBTOOLIZE --version | head -${HEAD_N}1 | sed 's/[^0-9]*\([0-9\.][0-9\.]*\)/\1/'`" if [ "x$_version" = "x" ] ; then _version="0.0.0" fi $ECHO "Found GNU Libtool version $_version" version_check "$LIBTOOL_VERSION" "$_version" if [ $? 
-ne 0 ] ; then _report_error=yes fi fi if [ "x$_report_error" = "xyes" ] ; then version_error "$LIBTOOL_VERSION" "GNU Libtool" exit 1 fi ##################### # check for aclocal # ##################### if [ "x$ACLOCAL" = "x" ] ; then for ACLOCAL in aclocal ; do $VERBOSE_ECHO "Checking aclocal version: $ACLOCAL --version" $ACLOCAL --version > /dev/null 2>&1 if [ $? = 0 ] ; then break fi done else $ECHO "Using ACLOCAL environment variable override: $ACLOCAL" fi ######################## # check for autoheader # ######################## if [ "x$AUTOHEADER" = "x" ] ; then for AUTOHEADER in autoheader ; do $VERBOSE_ECHO "Checking autoheader version: $AUTOHEADER --version" $AUTOHEADER --version > /dev/null 2>&1 if [ $? = 0 ] ; then break fi done else $ECHO "Using AUTOHEADER environment variable override: $AUTOHEADER" fi ######################### # check if version only # ######################### $VERBOSE_ECHO "Checking whether to only output version information" if [ "x$VERSION_ONLY" = "xyes" ] ; then $ECHO ident $ECHO "---" $ECHO "Version requested. No preparation or configuration will be performed." exit 0 fi ################################# # PROTECT_FROM_CLOBBER FUNCTION # ################################# protect_from_clobber ( ) { PFC_INIT=1 # protect COPYING & INSTALL from overwrite by automake. the # automake force option will (inappropriately) ignore the existing # contents of a COPYING and/or INSTALL files (depending on the # version) instead of just forcing *missing* files like it does # for AUTHORS, NEWS, and README. this is broken but extremely # prevalent behavior, so we protect against it by keeping a backup # of the file that can later be restored. 
recursive_protect ( ) {

    # Protect COPYING/INSTALL in the current directory, then do the
    # same in every directory its configure template lists via
    # AC_CONFIG_SUBDIRS.  START_PATH is expected to have been set to
    # the starting `pwd` before the first call; each collected
    # directory is entered from there.

    # back up COPYING and INSTALL in this directory first
    protect_from_clobber

    # a stale autom4te cache is removed on the way through
    if [ -d autom4te.cache ] ; then
        $VERBOSE_ECHO "Found an autom4te.cache directory, deleting it"
        $VERBOSE_ECHO "rm -rf autom4te.cache"
        rm -rf autom4te.cache
    fi

    # without a configure template there are no subdirs to discover
    _configure="`locate_configure_template`"
    if test -z "$_configure" ; then
        return
    fi

    # pull the directory list out of uncommented AC_CONFIG_SUBDIRS(...)
    # lines, stripping the macro name and any [ ] quoting
    _det_config_subdirs="`grep AC_CONFIG_SUBDIRS $_configure |
        grep -v '.*#.*AC_CONFIG_SUBDIRS' |
        sed 's/^[ ]*AC_CONFIG_SUBDIRS(\(.*\)).*/\1/' |
        sed 's/.*\[\(.*\)\].*/\1/'`"

    # keep only the entries that exist, as quoted absolute paths
    CHECK_DIRS=""
    for dir in $_det_config_subdirs ; do
        test -d "`pwd`/$dir" && CHECK_DIRS="$CHECK_DIRS \"`pwd`/$dir\""
    done

    # descend into each one
    if test -n "$CHECK_DIRS" ; then
        $VERBOSE_ECHO "Recursively scanning the following directories:"
        $VERBOSE_ECHO " $CHECK_DIRS"
        for dir in $CHECK_DIRS ; do
            $VERBOSE_ECHO "Protecting files from automake in $dir"
            cd "$START_PATH"
            # the entries carry embedded quotes, hence the eval
            eval "cd $dir"

            recursive_protect
        done
    fi
} # end of recursive_protect
initialize ( ) {

    # this routine performs a variety of directory-specific
    # initializations.  some are sanity checks, some are preventive,
    # and some are necessary setup detection.
    #
    # this function sets:
    #   CONFIGURE
    #   SEARCH_DIRS
    #   CONFIG_SUBDIRS
    #
    # it also assigns _aux_dir, and exits with status 1 when no
    # configure template can be located in the current directory.

    ##################################
    # check for a configure template #
    ##################################
    CONFIGURE="`locate_configure_template`"
    if [ "x$CONFIGURE" = "x" ] ; then
        $ECHO
        $ECHO "A configure.ac or configure.in file could not be located implying"
        $ECHO "that the GNU Build System is at least not used in this directory. In"
        $ECHO "any case, there is nothing to do here without one of those files."
        $ECHO
        $ECHO "ERROR: No configure.in or configure.ac file found in `pwd`"
        exit 1
    fi

    # The top-level Makefile.am is looked up next to the configure
    # template.  dirname has to be applied to the template path alone:
    # with "/Makefile.am" appended inside the dirname call the result is
    # just $CONFIGURE again, which always exists at this point.
    _makefile_am="`dirname \"$CONFIGURE\"`/Makefile.am"

    #####################
    # detect an aux dir #
    #####################
    _aux_dir="`grep AC_CONFIG_AUX_DIR $CONFIGURE | grep -v '.*#.*AC_CONFIG_AUX_DIR' | tail -${TAIL_N}1 | sed 's/^[ ]*AC_CONFIG_AUX_DIR(\(.*\)).*/\1/' | sed 's/.*\[\(.*\)\].*/\1/'`"
    if test ! -d "$_aux_dir" ; then
        # fall back to the current directory when none is declared/present
        _aux_dir=.
    else
        $VERBOSE_ECHO "Detected auxillary directory: $_aux_dir"
    fi

    ################################
    # detect a recursive configure #
    ################################
    CONFIG_SUBDIRS=""
    _det_config_subdirs="`grep AC_CONFIG_SUBDIRS $CONFIGURE | grep -v '.*#.*AC_CONFIG_SUBDIRS' | sed 's/^[ ]*AC_CONFIG_SUBDIRS(\(.*\)).*/\1/' | sed 's/.*\[\(.*\)\].*/\1/'`"
    for dir in $_det_config_subdirs ; do
        if test -d "`pwd`/$dir" ; then
            $VERBOSE_ECHO "Detected recursive configure directory: `pwd`/$dir"
            CONFIG_SUBDIRS="$CONFIG_SUBDIRS `pwd`/$dir"
        fi
    done

    ###########################################################
    # make sure certain required files exist for GNU projects #
    ###########################################################
    _marker_found=""
    _marker_found_message_intro='Detected non-GNU marker "'
    _marker_found_message_mid='" in '
    for marker in foreign cygnus ; do
        _marker_found_message=${_marker_found_message_intro}${marker}${_marker_found_message_mid}

        # first look in the configure template ...
        _marker_found="`grep 'AM_INIT_AUTOMAKE.*'${marker} $CONFIGURE`"
        if [ ! "x$_marker_found" = "x" ] ; then
            $VERBOSE_ECHO "${_marker_found_message}`basename \"$CONFIGURE\"`"
            break
        fi

        # ... then in the Makefile.am beside it, if there is one
        if test -f "$_makefile_am" ; then
            _marker_found="`grep 'AUTOMAKE_OPTIONS.*'${marker} \"$_makefile_am\"`"
            if [ ! "x$_marker_found" = "x" ] ; then
                $VERBOSE_ECHO "${_marker_found_message}Makefile.am"
                break
            fi
        fi
    done

    # no foreign/cygnus marker: automake will insist on the GNU files,
    # so create empty ones for any that are missing
    if [ "x${_marker_found}" = "x" ] ; then
        _suggest_foreign=no
        for file in AUTHORS COPYING ChangeLog INSTALL NEWS README ; do
            if [ ! -f $file ] ; then
                $VERBOSE_ECHO "Touching ${file} since it does not exist"
                _suggest_foreign=yes
                touch $file
            fi
        done

        if [ "x${_suggest_foreign}" = "xyes" ] ; then
            $ECHO
            $ECHO "Warning: Several files expected of projects that conform to the GNU"
            $ECHO "coding standards were not found. The files were automatically added"
            $ECHO "for you since you do not have a 'foreign' declaration specified."
            $ECHO
            $ECHO "Considered adding 'foreign' to AM_INIT_AUTOMAKE in `basename \"$CONFIGURE\"`"
            if test -f "$_makefile_am" ; then
                $ECHO "or to AUTOMAKE_OPTIONS in your top-level Makefile.am file."
            fi
            $ECHO
        fi
    fi

    ##################################################
    # make sure certain generated files do not exist #
    ##################################################
    for file in config.guess config.sub ltmain.sh ; do
        if test -f "${_aux_dir}/${file}" ; then
            $VERBOSE_ECHO "mv -f \"${_aux_dir}/${file}\" \"${_aux_dir}/${file}.backup\""
            mv -f "${_aux_dir}/${file}" "${_aux_dir}/${file}.backup"
        fi
    done

    ############################
    # search alternate m4 dirs #
    ############################
    SEARCH_DIRS=""
    for dir in m4 ; do
        if [ -d $dir ] ; then
            $VERBOSE_ECHO "Found extra aclocal search directory: $dir"
            SEARCH_DIRS="$SEARCH_DIRS -I $dir"
        fi
    done

    ######################################
    # remove any previous build products #
    ######################################
    if test -d autom4te.cache ; then
        $VERBOSE_ECHO "Found an autom4te.cache directory, deleting it"
        $VERBOSE_ECHO "rm -rf autom4te.cache"
        rm -rf autom4te.cache
    fi
    # tcl/tk (and probably others) have a customized aclocal.m4, so
    # aclocal.m4 is deliberately left in place.

} # end of initialize()
download_gnulib_config_guess () {
    # Fetch config.guess from CONFIG_GUESS_URL with the first of
    # wget, curl or fetch that both runs and succeeds, and install it as
    # ${_aux_dir}/config.guess.  On failure a warning is printed and the
    # temporary download file is removed.  Uses the global variable "ret".
    config_guess_temp="config.guess.$$.download"
    ret=1
    for __cmd in wget curl fetch ; do
        $VERBOSE_ECHO "Checking for command ${__cmd}"
        ${__cmd} --version > /dev/null 2>&1
        if [ ! $? = 0 ] ; then
            # tool not usable here; "ret" must keep saying "not downloaded"
            ret=1
            continue
        fi

        # [^0-9][^0-9]* instead of [^0-9]\+ : \+ is not portable sed syntax
        __cmd_version=`${__cmd} --version | head -n 1 | sed -e 's/^[^0-9][^0-9]*//' -e 's/ .*//'`
        $VERBOSE_ECHO "Found ${__cmd} ${__cmd_version}"

        # option that precedes the output file name for each tool
        opts=""
        case ${__cmd} in
            wget)
                opts="-O"
                ;;
            curl)
                opts="-o"
                ;;
            fetch)
                opts="-t 5 -f"
                ;;
        esac

        $VERBOSE_ECHO "Running $__cmd \"${CONFIG_GUESS_URL}\" $opts \"${config_guess_temp}\""
        eval "$__cmd \"${CONFIG_GUESS_URL}\" $opts \"${config_guess_temp}\"" > /dev/null 2>&1
        if [ $? = 0 ] && test -s "${config_guess_temp}" ; then
            mv -f "${config_guess_temp}" "${_aux_dir}/config.guess"
            ret=0
            break
        fi

        # The tool exists but the transfer failed.  Record that, so a
        # failure of the last available tool is still reported below
        # (previously "ret" kept the 0 from the --version probe).
        ret=1
    done

    if [ ! $ret = 0 ] ; then
        $ECHO "Warning: config.guess download failed from: $CONFIG_GUESS_URL"
        rm -f "${config_guess_temp}"
    fi
}
libtool_failure ( ) {

    # libtool is rather error-prone in comparison to the other
    # autotools and this routine attempts to compensate for some
    # common failures.  the output after a libtoolize failure is
    # parsed for an error related to AC_PROG_LIBTOOL and if found, we
    # attempt to inject a project-provided libtool.m4 file.
    #
    # $1   captured tool output to inspect
    # $2.. optional arguments to hand to the restarted script
    #
    # Returns 1 when a restart was already attempted (RUN_RECURSIVE=no).
    # When the libtool macros get injected, the script is re-run and
    # this shell exits with the status of that run.

    _autoconf_output="$1"

    # Inside a function the positional parameters are the function's
    # own, so "$1" is the captured output, not a command-line option.
    # Drop it so that it is not forwarded to the restarted script.
    if test $# -gt 0 ; then
        shift
    fi

    if [ "x$RUN_RECURSIVE" = "xno" ] ; then
        # we already tried the libtool.m4, don't try again
        return 1
    fi

    if test -f "$LIBTOOL_M4" ; then
        # quoted so the output is neither word-split nor glob-expanded
        found_libtool="`$ECHO \"$_autoconf_output\" | grep AC_PROG_LIBTOOL`"
        if test ! "x$found_libtool" = "x" ; then
            if test -f acinclude.m4 ; then
                rm -f acinclude.m4.$$.backup
                $VERBOSE_ECHO "cat acinclude.m4 > acinclude.m4.$$.backup"
                cat acinclude.m4 > acinclude.m4.$$.backup
                # only an existing file can need its write bit set
                chmod u+w acinclude.m4
            fi
            $VERBOSE_ECHO "cat \"$LIBTOOL_M4\" >> acinclude.m4"
            cat "$LIBTOOL_M4" >> acinclude.m4

            # don't keep doing this
            RUN_RECURSIVE=no
            export RUN_RECURSIVE
            untrap_abnormal

            $ECHO
            $ECHO "Restarting the preparation steps with libtool macros in acinclude.m4"
            $VERBOSE_ECHO sh $AUTOGEN_SH "$@"
            sh "$AUTOGEN_SH" "$@"
            exit $?
        fi
    fi
}
$ret = 0 ] ; then $ECHO "ERROR: $LIBTOOLIZE failed" && exit 2 ; fi else if [ "x$HAVE_ALT_LIBTOOLIZE" = "xyes" ] ; then $VERBOSE_ECHO "$LIBTOOLIZE $ALT_LIBTOOLIZE_OPTIONS" libtoolize_output="`$LIBTOOLIZE $ALT_LIBTOOLIZE_OPTIONS 2>&1`" ret=$? $VERBOSE_ECHO "$libtoolize_output" if [ ! $ret = 0 ] ; then $ECHO "ERROR: $LIBTOOLIZE failed" && exit 2 ; fi fi fi ########### # aclocal # ########### # re-run again as instructed by libtoolize $VERBOSE_ECHO "$ACLOCAL $SEARCH_DIRS $ACLOCAL_OPTIONS" aclocal_output="`$ACLOCAL $SEARCH_DIRS $ACLOCAL_OPTIONS 2>&1`" ret=$? $VERBOSE_ECHO "$aclocal_output" # libtoolize might put ltmain.sh in the wrong place if test -f ltmain.sh ; then if test ! -f "${_aux_dir}/ltmain.sh" ; then $ECHO $ECHO "Warning: $LIBTOOLIZE is creating ltmain.sh in the wrong directory" $ECHO $ECHO "Fortunately, the problem can be worked around by simply copying the" $ECHO "file to the appropriate location (${_aux_dir}/). This has been done for you." $ECHO $VERBOSE_ECHO "cp -p ltmain.sh \"${_aux_dir}/ltmain.sh\"" cp -p ltmain.sh "${_aux_dir}/ltmain.sh" $ECHO $ECHO_N "Continuing build preparation ... $ECHO_C" fi fi # ltmain.sh if [ "x$DOWNLOAD" = "xyes" ] ; then download_gnulib_config_guess fi fi # libtoolize_needed ############ # autoconf # ############ $VERBOSE_ECHO $VERBOSE_ECHO "$AUTOCONF $AUTOCONF_OPTIONS" autoconf_output="`$AUTOCONF $AUTOCONF_OPTIONS 2>&1`" ret=$? $VERBOSE_ECHO "$autoconf_output" if [ ! 
$ret = 0 ] ; then # retry without the -f and check for usage of macros that are too new ac2_59_macros="AC_C_RESTRICT AC_INCLUDES_DEFAULT AC_LANG_ASSERT AC_LANG_WERROR AS_SET_CATFILE" ac2_55_macros="AC_COMPILER_IFELSE AC_FUNC_MBRTOWC AC_HEADER_STDBOOL AC_LANG_CONFTEST AC_LANG_SOURCE AC_LANG_PROGRAM AC_LANG_CALL AC_LANG_FUNC_TRY_LINK AC_MSG_FAILURE AC_PREPROC_IFELSE" ac2_54_macros="AC_C_BACKSLASH_A AC_CONFIG_LIBOBJ_DIR AC_GNU_SOURCE AC_PROG_EGREP AC_PROG_FGREP AC_REPLACE_FNMATCH AC_FUNC_FNMATCH_GNU AC_FUNC_REALLOC AC_TYPE_MBSTATE_T" macros_to_search="" ac_major="`echo ${AUTOCONF_VERSION}. | cut -d. -f1 | sed 's/[^0-9]//g'`" ac_minor="`echo ${AUTOCONF_VERSION}. | cut -d. -f2 | sed 's/[^0-9]//g'`" if [ $ac_major -lt 2 ] ; then macros_to_search="$ac2_59_macros $ac2_55_macros $ac2_54_macros" else if [ $ac_minor -lt 54 ] ; then macros_to_search="$ac2_59_macros $ac2_55_macros $ac2_54_macros" elif [ $ac_minor -lt 55 ] ; then macros_to_search="$ac2_59_macros $ac2_55_macros" elif [ $ac_minor -lt 59 ] ; then macros_to_search="$ac2_59_macros" fi fi configure_ac_macros=__none__ for feature in $macros_to_search ; do $VERBOSE_ECHO "Searching for $feature in $CONFIGURE" found="`grep \"^$feature.*\" $CONFIGURE`" if [ ! "x$found" = "x" ] ; then if [ "x$configure_ac_macros" = "x__none__" ] ; then configure_ac_macros="$feature" else configure_ac_macros="$feature $configure_ac_macros" fi fi done if [ ! "x$configure_ac_macros" = "x__none__" ] ; then $ECHO $ECHO "Warning: Unsupported macros were found in $CONFIGURE" $ECHO $ECHO "The `basename \"$CONFIGURE\"` file was scanned in order to determine if any" $ECHO "unsupported macros are used that exceed the minimum version" $ECHO "settings specified within this file. As such, the following macros" $ECHO "should be removed from configure.ac or the version numbers in this" $ECHO "file should be increased:" $ECHO $ECHO "$configure_ac_macros" $ECHO $ECHO $ECHO_N "Ignorantly continuing build preparation ... 
$ECHO_C" fi ################### # autoconf, retry # ################### $VERBOSE_ECHO $VERBOSE_ECHO "$AUTOCONF" autoconf_output="`$AUTOCONF 2>&1`" ret=$? $VERBOSE_ECHO "$autoconf_output" if [ ! $ret = 0 ] ; then # test if libtool is busted libtool_failure "$autoconf_output" # let the user know what went wrong cat < #include #include #include #include #include #include "tap-ctl.h" int tap_ctl_open(const int id, const int minor, const char *params, int flags, const int prt_minor, const char *secondary) { int err; tapdisk_message_t message; memset(&message, 0, sizeof(message)); message.type = TAPDISK_MESSAGE_OPEN; message.cookie = minor; message.u.params.devnum = minor; message.u.params.prt_devnum = prt_minor; message.u.params.flags = flags; err = snprintf(message.u.params.path, sizeof(message.u.params.path) - 1, "%s", params); if (err >= sizeof(message.u.params.path)) { EPRINTF("name too long\n"); return ENAMETOOLONG; } if (secondary) { err = snprintf(message.u.params.secondary, sizeof(message.u.params.secondary) - 1, "%s", secondary); if (err >= sizeof(message.u.params.secondary)) { EPRINTF("secondary image name too long\n"); return ENAMETOOLONG; } } err = tap_ctl_connect_send_and_receive(id, &message, NULL); if (err) return err; switch (message.type) { case TAPDISK_MESSAGE_OPEN_RSP: break; case TAPDISK_MESSAGE_ERROR: err = -message.u.response.error; EPRINTF("open failed, err %d\n", err); break; default: EPRINTF("got unexpected result '%s' from %d\n", tapdisk_message_name(message.type), id); err = EINVAL; } return err; } blktap-2.0.90/control/tap-ctl-spawn.c0000644000000000000000000001071511664745551016115 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include "tap-ctl.h" #include "blktap2.h" static pid_t __tap_ctl_spawn(int *readfd) { int child, channel[2]; char *tapdisk; if (pipe(channel)) { EPRINTF("pipe failed: %d\n", errno); return -errno; } if ((child = fork()) == -1) { EPRINTF("fork failed: %d\n", errno); return -errno; } if (child) { close(channel[1]); *readfd = channel[0]; return child; } if (dup2(channel[1], STDOUT_FILENO) == -1) { EPRINTF("dup2 failed: %d\n", errno); exit(errno); } if (dup2(channel[1], STDERR_FILENO) == -1) { EPRINTF("dup2 failed: %d\n", errno); exit(errno); } close(channel[0]); close(channel[1]); tapdisk = getenv("TAPDISK"); if (!tapdisk) tapdisk = getenv("TAPDISK2"); if (tapdisk) { execlp(tapdisk, tapdisk, NULL); exit(errno); } execl(TAPDISK_EXECDIR "/" TAPDISK_EXEC, TAPDISK_EXEC, NULL); if (errno == ENOENT) execl(TAPDISK_BUILDDIR "/" TAPDISK_EXEC, TAPDISK_EXEC, NULL); exit(errno); } pid_t tap_ctl_get_pid(const int id) { int err; tapdisk_message_t message; memset(&message, 0, sizeof(message)); message.type = TAPDISK_MESSAGE_PID; err = tap_ctl_connect_send_and_receive(id, &message, NULL); if (err) return err; return message.u.tapdisk_pid; } static int tap_ctl_wait(pid_t child) { pid_t pid; int status; pid = waitpid(child, &status, 0); if (pid < 0) { EPRINTF("wait(%d) failed, err %d\n", child, errno); return -errno; } if (WIFEXITED(status)) { int code = WEXITSTATUS(status); if (code) EPRINTF("tapdisk2[%d] failed, status %d\n", child, code); return -code; } if (WIFSIGNALED(status)) { int signo = WTERMSIG(status); EPRINTF("tapdisk2[%d] killed by signal %d\n", child, signo); if (signo == SIGUSR1) /* NB. there's a race between tapdisk's * sigaction init and xen-bugtool shooting * debug signals. If killed by something as * innocuous as USR1, then retry. 
*/ return -EAGAIN; return -EINTR; } EPRINTF("tapdisk2[%d]: unexpected status %#x\n", child, status); return -EAGAIN; } static int tap_ctl_get_child_id(int readfd) { int id; FILE *f; f = fdopen(readfd, "r"); if (!f) { EPRINTF("fdopen failed: %d\n", errno); return -1; } errno = 0; if (fscanf(f, BLKTAP2_CONTROL_DIR"/" BLKTAP2_CONTROL_SOCKET"%d", &id) != 1) { errno = (errno ? : EINVAL); EPRINTF("parsing id failed: %d\n", errno); id = -1; } fclose(f); return id; } int tap_ctl_spawn(void) { pid_t child; int err, id, readfd; readfd = -1; again: child = __tap_ctl_spawn(&readfd); if (child < 0) return child; err = tap_ctl_wait(child); if (err) { if (err == -EAGAIN) goto again; return err; } id = tap_ctl_get_child_id(readfd); if (id < 0) EPRINTF("get_id failed, child %d err %d\n", child, errno); return id; } blktap-2.0.90/control/Makefile.am0000644000000000000000000000226211664745551015311 0ustar rootroot AM_CFLAGS = -Wall AM_CFLAGS += -Werror AM_CPPFLAGS = -D_GNU_SOURCE AM_CPPFLAGS += -DTAPCTL AM_CPPFLAGS += -I$(top_srcdir)/include AM_CPPFLAGS += -DTAPDISK_EXEC='"tapdisk"' AM_CPPFLAGS += -DTAPDISK_EXECDIR='"$(libexecdir)"' AM_CPPFLAGS += -DTAPDISK_BUILDDIR='"$(top_builddir)/drivers"' sbin_PROGRAMS = tap-ctl tap_ctl_LDADD = libblktapctl.la lib_LTLIBRARIES = libblktapctl.la libblktapctl_la_SOURCES = tap-ctl-ipc.c libblktapctl_la_SOURCES += tap-ctl-list.c libblktapctl_la_SOURCES += tap-ctl-allocate.c libblktapctl_la_SOURCES += tap-ctl-free.c libblktapctl_la_SOURCES += tap-ctl-create.c libblktapctl_la_SOURCES += tap-ctl-destroy.c libblktapctl_la_SOURCES += tap-ctl-spawn.c libblktapctl_la_SOURCES += tap-ctl-attach.c libblktapctl_la_SOURCES += tap-ctl-detach.c libblktapctl_la_SOURCES += tap-ctl-open.c libblktapctl_la_SOURCES += tap-ctl-close.c libblktapctl_la_SOURCES += tap-ctl-pause.c libblktapctl_la_SOURCES += tap-ctl-unpause.c libblktapctl_la_SOURCES += tap-ctl-major.c libblktapctl_la_SOURCES += tap-ctl-check.c libblktapctl_la_SOURCES += tap-ctl-stats.c libblktapctl_la_LDFLAGS 
= -version-info 1:1:1 udev_rulesdir = $(sysconfdir)/udev/rules.d dist_udev_rules_DATA = blktap.rules blktap-2.0.90/control/tap-ctl-destroy.c0000644000000000000000000000406611664745551016460 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/*
 * Tear down a tapdisk device: close the image, detach the minor from
 * tapdisk @id, then free the minor.  The sequence stops at the first
 * step that fails and that step's error is returned; 0 means all three
 * succeeded.
 *
 * NOTE(review): @force is accepted but the close request is always
 * issued with a force value of 0 -- confirm whether that is intended.
 */
int
tap_ctl_destroy(const int id, const int minor,
		int force, struct timeval *timeout)
{
	int rc;

	rc = tap_ctl_close(id, minor, 0, timeout);

	if (rc == 0)
		rc = tap_ctl_detach(id, minor);

	if (rc == 0)
		rc = tap_ctl_free(minor);

	return rc;
}
IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include "tap-ctl.h" #include "blktap2.h" int tap_ctl_check_blktap(const char **msg) { FILE *f; int err = 0, minor; char name[32]; memset(name, 0, sizeof(name)); f = fopen("/proc/misc", "r"); if (!f) { *msg = "failed to open /proc/misc"; return -errno; } while (fscanf(f, "%d %32s", &minor, name) == 2) { if (!strcmp(name, BLKTAP2_CONTROL_NAME)) goto out; } err = -ENOSYS; *msg = "blktap kernel module not installed"; out: fclose(f); return err; } int tap_ctl_check(const char **msg) { int err; err = tap_ctl_check_blktap(msg); if (err) goto out; err = 0; *msg = "ok"; out: return err; } blktap-2.0.90/control/tap-ctl-ipc.c0000644000000000000000000001241311664745551015535 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. 
nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include #include #include #include "tap-ctl.h" #include "blktap2.h" int tap_ctl_debug = 0; int tap_ctl_read_raw(int fd, void *buf, size_t size, struct timeval *timeout) { fd_set readfds; size_t offset = 0; int ret; while (offset < size) { FD_ZERO(&readfds); FD_SET(fd, &readfds); ret = select(fd + 1, &readfds, NULL, NULL, timeout); if (ret == -1) break; else if (FD_ISSET(fd, &readfds)) { ret = read(fd, buf + offset, size - offset); if (ret <= 0) break; offset += ret; } else break; } if (offset != size) { EPRINTF("failure reading data %zd/%zd\n", offset, size); return -EIO; } return 0; } int tap_ctl_read_message(int fd, tapdisk_message_t *message, struct timeval *timeout) { size_t size = sizeof(tapdisk_message_t); int err; err = tap_ctl_read_raw(fd, message, size, timeout); if (err) return err; DBG("received '%s' message (uuid = %u)\n", tapdisk_message_name(message->type), message->cookie); return 0; } int tap_ctl_write_message(int fd, 
tapdisk_message_t *message, struct timeval *timeout) { fd_set writefds; int ret, len, offset; offset = 0; len = sizeof(tapdisk_message_t); DBG("sending '%s' message (uuid = %u)\n", tapdisk_message_name(message->type), message->cookie); while (offset < len) { FD_ZERO(&writefds); FD_SET(fd, &writefds); /* we don't bother reinitializing tv. at worst, it will wait a * bit more time than expected. */ ret = select(fd + 1, NULL, &writefds, NULL, timeout); if (ret == -1) break; else if (FD_ISSET(fd, &writefds)) { ret = write(fd, message + offset, len - offset); if (ret <= 0) break; offset += ret; } else break; } if (offset != len) { EPRINTF("failure writing message\n"); return -EIO; } return 0; } int tap_ctl_send_and_receive(int sfd, tapdisk_message_t *message, struct timeval *timeout) { int err; err = tap_ctl_write_message(sfd, message, timeout); if (err) { EPRINTF("failed to send '%s' message\n", tapdisk_message_name(message->type)); return err; } err = tap_ctl_read_message(sfd, message, timeout); if (err) { EPRINTF("failed to receive '%s' message\n", tapdisk_message_name(message->type)); return err; } return 0; } char * tap_ctl_socket_name(int id) { char *name; if (asprintf(&name, "%s/%s%d", BLKTAP2_CONTROL_DIR, BLKTAP2_CONTROL_SOCKET, id) == -1) return NULL; return name; } int tap_ctl_connect(const char *name, int *sfd) { int fd, err; struct sockaddr_un saddr; *sfd = -1; fd = socket(AF_UNIX, SOCK_STREAM, 0); if (fd == -1) { EPRINTF("couldn't create socket for %s: %d\n", name, errno); return -errno; } memset(&saddr, 0, sizeof(saddr)); saddr.sun_family = AF_UNIX; strcpy(saddr.sun_path, name); err = connect(fd, (const struct sockaddr *)&saddr, sizeof(saddr)); if (err) { EPRINTF("couldn't connect to %s: %d\n", name, errno); close(fd); return -errno; } *sfd = fd; return 0; } int tap_ctl_connect_id(int id, int *sfd) { int err; char *name; *sfd = -1; if (id < 0) { EPRINTF("invalid id %d\n", id); return -EINVAL; } name = tap_ctl_socket_name(id); if (!name) { EPRINTF("couldn't 
name socket for %d\n", id); return -ENOMEM; } err = tap_ctl_connect(name, sfd); free(name); return err; } int tap_ctl_connect_send_and_receive(int id, tapdisk_message_t *message, struct timeval *timeout) { int err, sfd; err = tap_ctl_connect_id(id, &sfd); if (err) return err; err = tap_ctl_send_and_receive(sfd, message, timeout); close(sfd); return err; } blktap-2.0.90/control/blktap.rules0000644000000000000000000000026611664745551015610 0ustar rootrootSUBSYSTEM=="misc", KERNEL=="blktap-control", NAME="blktap/control" SUBSYSTEM=="blktap2", KERNEL=="blktap[0-9]*", NAME="blktap/%k" SUBSYSTEM=="block", KERNEL=="td[a-z]*", NAME="%k" blktap-2.0.90/control/tap-ctl-attach.c0000644000000000000000000000442211664745551016227 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include "tap-ctl.h" int tap_ctl_attach(const int id, const int minor) { int err; tapdisk_message_t message; memset(&message, 0, sizeof(message)); message.type = TAPDISK_MESSAGE_ATTACH; message.cookie = minor; err = tap_ctl_connect_send_and_receive(id, &message, NULL); if (err) return err; if (message.type == TAPDISK_MESSAGE_ATTACH_RSP) { err = message.u.response.error; if (err) EPRINTF("attach failed: %d\n", err); } else { EPRINTF("got unexpected result '%s' from %d\n", tapdisk_message_name(message.type), id); err = EINVAL; } return err; } blktap-2.0.90/control/tap-ctl-close.c0000644000000000000000000000462311664745551016073 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. 
nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include "tap-ctl.h" int tap_ctl_close(const int id, const int minor, const int force, struct timeval *timeout) { int err; tapdisk_message_t message; memset(&message, 0, sizeof(message)); message.type = TAPDISK_MESSAGE_CLOSE; if (force) message.type = TAPDISK_MESSAGE_FORCE_SHUTDOWN; message.cookie = minor; err = tap_ctl_connect_send_and_receive(id, &message, timeout); if (err) return err; if (message.type == TAPDISK_MESSAGE_CLOSE_RSP) { err = message.u.response.error; if (err) EPRINTF("close failed: %d\n", err); } else { EPRINTF("got unexpected result '%s' from %d\n", tapdisk_message_name(message.type), id); err = EINVAL; } return err; } blktap-2.0.90/control/tap-ctl-create.c0000644000000000000000000000437711664745551016237 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include "tap-ctl.h" #include "blktap2.h" int tap_ctl_create(const char *params, char **devname, int flags, int parent_minor, char *secondary) { int err, id, minor; err = tap_ctl_allocate(&minor, devname); if (err) return err; id = tap_ctl_spawn(); if (id < 0) { err = id; goto destroy; } err = tap_ctl_attach(id, minor); if (err) goto destroy; err = tap_ctl_open(id, minor, params, flags, parent_minor, secondary); if (err) goto detach; return 0; detach: tap_ctl_detach(id, minor); destroy: tap_ctl_free(minor); return err; } blktap-2.0.90/control/tap-ctl-allocate.c0000644000000000000000000001225211664745551016547 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include "tap-ctl.h" #include "blktap2.h" static int tap_ctl_prepare_directory(const char *dir) { int err; char *ptr, *name, *start; err = access(dir, W_OK | R_OK); if (!err) return 0; name = strdup(dir); if (!name) return ENOMEM; start = name; for (;;) { ptr = strchr(start + 1, '/'); if (ptr) *ptr = '\0'; err = mkdir(name, 0755); if (err && errno != EEXIST) { PERROR("mkdir %s", name); err = errno; break; } if (!ptr) break; else { *ptr = '/'; start = ptr + 1; } } free(name); return err; } static int tap_ctl_make_device(const char *devname, const int major, const int minor, const int perm) { int err; char *copy, *dir; copy = strdup(devname); if (!copy) return ENOMEM; dir = dirname(copy); err = tap_ctl_prepare_directory(dir); free(copy); if (err) return err; if (!access(devname, F_OK)) if (unlink(devname)) { PERROR("unlink %s", devname); return errno; } err = mknod(devname, perm, makedev(major, minor)); if (err) { PERROR("mknod %s", devname); return errno; } return 0; } static int tap_ctl_check_environment(void) { FILE *f; int err, minor; char name[256]; err = tap_ctl_prepare_directory(BLKTAP2_CONTROL_DIR); if (err) return err; if (!access(BLKTAP2_CONTROL_DEVICE, R_OK | W_OK)) return 0; memset(name, 0, sizeof(name)); f = fopen("/proc/misc", "r"); if (!f) { EPRINTF("failed to open /proc/misc: 
%d\n", errno); return errno; } while (fscanf(f, "%d %256s", &minor, name) == 2) if (!strcmp(name, BLKTAP2_CONTROL_NAME)) { err = tap_ctl_make_device(BLKTAP2_CONTROL_DEVICE, MISC_MAJOR, minor, S_IFCHR | 0600); goto out; } err = ENOSYS; EPRINTF("didn't find %s in /proc/misc\n", BLKTAP2_CONTROL_NAME); out: fclose(f); return err; } static int tap_ctl_allocate_device(int *minor, char **devname) { char *name; int fd, err; struct blktap2_handle handle; *minor = -1; if (!devname) return EINVAL; fd = open(BLKTAP2_CONTROL_DEVICE, O_RDONLY); if (fd == -1) { EPRINTF("failed to open control device: %d\n", errno); return errno; } err = ioctl(fd, BLKTAP2_IOCTL_ALLOC_TAP, &handle); close(fd); if (err == -1) { EPRINTF("failed to allocate new device: %d\n", errno); return errno; } err = asprintf(&name, "%s%d", BLKTAP2_RING_DEVICE, handle.minor); if (err == -1) { err = ENOMEM; goto fail; } err = tap_ctl_make_device(name, handle.ring, handle.minor, S_IFCHR | 0600); free(name); if (err) { EPRINTF("creating ring device for %d failed: %d\n", handle.minor, err); goto fail; } if (*devname) name = *devname; else { err = asprintf(&name, "%s%d", BLKTAP2_IO_DEVICE, handle.minor); if (err == -1) { err = ENOMEM; goto fail; } *devname = name; } err = tap_ctl_make_device(name, handle.device, handle.minor, S_IFBLK | 0600); if (err) { EPRINTF("creating IO device for %d failed: %d\n", handle.minor, err); goto fail; } DBG("new interface: ring: %u, device: %u, minor: %u\n", handle.ring, handle.device, handle.minor); *minor = handle.minor; return 0; fail: tap_ctl_free(handle.minor); return err; } int tap_ctl_allocate(int *minor, char **devname) { int err; *minor = -1; err = tap_ctl_check_environment(); if (err) return err; err = tap_ctl_allocate_device(minor, devname); if (err) return err; return 0; } blktap-2.0.90/control/tap-ctl-pause.c0000644000000000000000000000436411664745551016105 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include "tap-ctl.h" int tap_ctl_pause(const int id, const int minor, struct timeval *timeout) { int err; tapdisk_message_t message; memset(&message, 0, sizeof(message)); message.type = TAPDISK_MESSAGE_PAUSE; message.cookie = minor; err = tap_ctl_connect_send_and_receive(id, &message, timeout); if (err) return err; if (message.type == TAPDISK_MESSAGE_PAUSE_RSP) err = message.u.response.error; else { err = EINVAL; EPRINTF("got unexpected result '%s' from %d\n", tapdisk_message_name(message.type), id); } return err; } blktap-2.0.90/control/tap-ctl-detach.c0000644000000000000000000000443211664745551016214 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include "tap-ctl.h" int tap_ctl_detach(const int id, const int minor) { int err; tapdisk_message_t message; memset(&message, 0, sizeof(message)); message.type = TAPDISK_MESSAGE_DETACH; message.cookie = minor; err = tap_ctl_connect_send_and_receive(id, &message, NULL); if (err) return err; if (message.type == TAPDISK_MESSAGE_DETACH_RSP) { err = message.u.response.error; if (err < 0) printf("detach failed: %d\n", err); } else { printf("got unexpected result '%s' from %d\n", tapdisk_message_name(message.type), id); err = EINVAL; } return err; } blktap-2.0.90/control/tap-ctl-unpause.c0000644000000000000000000000454211664745551016446 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. 
nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include "tap-ctl.h" int tap_ctl_unpause(const int id, const int minor, const char *params) { int err; tapdisk_message_t message; memset(&message, 0, sizeof(message)); message.type = TAPDISK_MESSAGE_RESUME; message.cookie = minor; if (params) strncpy(message.u.params.path, params, sizeof(message.u.params.path) - 1); err = tap_ctl_connect_send_and_receive(id, &message, NULL); if (err) return err; if (message.type == TAPDISK_MESSAGE_RESUME_RSP) err = message.u.response.error; else { err = EINVAL; EPRINTF("got unexpected result '%s' from %d\n", tapdisk_message_name(message.type), id); } return err; } blktap-2.0.90/control/tap-ctl-stats.c0000644000000000000000000000727311664745551016130 0ustar rootroot/* * Copyright (c) 2010, Citrix * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include "tap-ctl.h" int _tap_ctl_stats_connect_and_send(pid_t pid, int minor) { struct timeval timeout = { .tv_sec = 10, .tv_usec = 0 }; tapdisk_message_t message; int sfd, err; err = tap_ctl_connect_id(pid, &sfd); if (err) return err; memset(&message, 0, sizeof(message)); message.type = TAPDISK_MESSAGE_STATS; message.cookie = minor; err = tap_ctl_write_message(sfd, &message, &timeout); if (err) return err; return sfd; } ssize_t tap_ctl_stats(pid_t pid, int minor, char *buf, size_t size) { tapdisk_message_t message; int sfd, err; size_t len; sfd = _tap_ctl_stats_connect_and_send(pid, minor); if (sfd < 0) return sfd; err = tap_ctl_read_message(sfd, &message, NULL); if (err) return err; len= message.u.info.length; if (len < 0) { err = len; goto out; } if (size < len + 1) len = size - 1; err = tap_ctl_read_raw(sfd, buf, len, NULL); if (err) goto out; buf[len] = 0; out: close(sfd); return err; } int tap_ctl_stats_fwrite(pid_t pid, int minor, FILE *stream) { tapdisk_message_t message; int sfd = -1, prot, flags, err; size_t len, bufsz; char *buf = MAP_FAILED; prot = PROT_READ|PROT_WRITE; flags = MAP_ANONYMOUS|MAP_PRIVATE; bufsz = sysconf(_SC_PAGE_SIZE); buf = mmap(NULL, bufsz, prot, flags, -1, 0); if (buf == MAP_FAILED) { buf = NULL; err = -ENOMEM; goto out; } sfd = _tap_ctl_stats_connect_and_send(pid, minor); if (sfd < 0) { err = sfd; goto out; } err = tap_ctl_read_message(sfd, &message, NULL); if (err) goto out; len = message.u.info.length; err = len; if (len < 0) goto out; while (len) { fd_set rfds; size_t in, out; int n; FD_ZERO(&rfds); FD_SET(sfd, &rfds); n = select(sfd + 1, &rfds, NULL, NULL, NULL); err = n; if (n < 0) goto out; in = read(sfd, buf, bufsz); err = in; if (in <= 0) goto out; len -= in; out = fwrite(buf, in, 1, stream); if (out != in) { err = -errno; goto out; } } out: if (sfd >= 0) close(sfd); if (buf != MAP_FAILED) munmap(buf, bufsz); return err; } 
blktap-2.0.90/control/tap-ctl-list.c0000644000000000000000000001701711664745551015742 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include "tap-ctl.h" #include "blktap2.h" #include "list.h" static tap_list_t* _tap_list_alloc(void) { const size_t sz = sizeof(tap_list_t); tap_list_t *tl; tl = malloc(sz); if (!tl) return NULL; tl->pid = -1; tl->minor = -1; tl->state = -1; tl->type = NULL; tl->path = NULL; INIT_LIST_HEAD(&tl->entry); return tl; } static void _tap_list_free(tap_list_t *tl) { list_del_init(&tl->entry); if (tl->type) { free(tl->type); tl->type = NULL; } if (tl->path) { free(tl->path); tl->path = NULL; } free(tl); } int _parse_params(const char *params, char **type, char **path) { char *ptr; size_t len; ptr = strchr(params, ':'); if (!ptr) return -EINVAL; len = ptr - params; *type = strndup(params, len); *path = strdup(params + len + 1); if (!*type || !*path) { free(*type); *type = NULL; free(*path); *path = NULL; return -errno; } return 0; } void tap_ctl_list_free(struct list_head *list) { tap_list_t *tl, *n; tap_list_for_each_entry_safe(tl, n, list) _tap_list_free(tl); } static int _tap_ctl_find_minors(struct list_head *list) { const char *pattern, *format; glob_t glbuf = { 0 }; tap_list_t *tl; int i, err; INIT_LIST_HEAD(list); pattern = BLKTAP2_SYSFS_DIR"/blktap*"; format = BLKTAP2_SYSFS_DIR"/blktap%d"; err = glob(pattern, 0, NULL, &glbuf); switch (err) { case GLOB_NOMATCH: goto done; case GLOB_ABORTED: case GLOB_NOSPACE: err = -errno; EPRINTF("%s: glob failed, err %d", pattern, err); goto fail; } for (i = 0; i < glbuf.gl_pathc; ++i) { int n; tl = _tap_list_alloc(); if (!tl) { err = -ENOMEM; goto fail; } n = sscanf(glbuf.gl_pathv[i], format, &tl->minor); if (n != 1) { _tap_list_free(tl); continue; } list_add_tail(&tl->entry, list); } done: err = 0; out: if (glbuf.gl_pathv) globfree(&glbuf); return err; fail: tap_ctl_list_free(list); goto out; } int _tap_ctl_find_tapdisks(struct list_head *list) { const char *pattern, *format; glob_t glbuf = { 0 }; int err, i, n_taps = 0; 
pattern = BLKTAP2_CONTROL_DIR"/"BLKTAP2_CONTROL_SOCKET"*"; format = BLKTAP2_CONTROL_DIR"/"BLKTAP2_CONTROL_SOCKET"%d"; INIT_LIST_HEAD(list); err = glob(pattern, 0, NULL, &glbuf); switch (err) { case GLOB_NOMATCH: goto done; case GLOB_ABORTED: case GLOB_NOSPACE: err = -errno; EPRINTF("%s: glob failed, err %d", pattern, err); goto fail; } for (i = 0; i < glbuf.gl_pathc; ++i) { tap_list_t *tl; int n; tl = _tap_list_alloc(); if (!tl) { err = -ENOMEM; goto fail; } n = sscanf(glbuf.gl_pathv[i], format, &tl->pid); if (n != 1) goto skip; tl->pid = tap_ctl_get_pid(tl->pid); if (tl->pid < 0) goto skip; list_add_tail(&tl->entry, list); n_taps++; continue; skip: _tap_list_free(tl); } done: err = 0; out: if (glbuf.gl_pathv) globfree(&glbuf); return err ? : n_taps; fail: tap_ctl_list_free(list); goto out; } int _tap_ctl_list_tapdisk(pid_t pid, struct list_head *list) { struct timeval timeout = { .tv_sec = 10, .tv_usec = 0 }; tapdisk_message_t message; tap_list_t *tl; int err, sfd; err = tap_ctl_connect_id(pid, &sfd); if (err) return err; memset(&message, 0, sizeof(message)); message.type = TAPDISK_MESSAGE_LIST; message.cookie = -1; err = tap_ctl_write_message(sfd, &message, &timeout); if (err) return err; INIT_LIST_HEAD(list); do { err = tap_ctl_read_message(sfd, &message, &timeout); if (err) { err = -EPROTO; goto fail; } if (message.u.list.count == 0) break; tl = _tap_list_alloc(); if (!tl) { err = -ENOMEM; goto fail; } tl->pid = pid; tl->minor = message.u.list.minor; tl->state = message.u.list.state; if (message.u.list.path[0] != 0) { err = _parse_params(message.u.list.path, &tl->type, &tl->path); if (err) { _tap_list_free(tl); goto fail; } } list_add(&tl->entry, list); } while (1); err = 0; out: close(sfd); return 0; fail: tap_ctl_list_free(list); goto out; } int tap_ctl_list(struct list_head *list) { struct list_head minors, tapdisks, vbds; tap_list_t *t, *next_t, *v, *next_v, *m, *next_m; int err; /* * Find all minors, find all tapdisks, then list all minors * they attached 
to. Output is a 3-way outer join. */ err = _tap_ctl_find_minors(&minors); if (err < 0) goto fail; err = _tap_ctl_find_tapdisks(&tapdisks); if (err < 0) goto fail; INIT_LIST_HEAD(list); tap_list_for_each_entry_safe(t, next_t, &tapdisks) { err = _tap_ctl_list_tapdisk(t->pid, &vbds); if (err || list_empty(&vbds)) { list_move_tail(&t->entry, list); continue; } tap_list_for_each_entry_safe(v, next_v, &vbds) { tap_list_for_each_entry_safe(m, next_m, &minors) if (m->minor == v->minor) { _tap_list_free(m); break; } list_move_tail(&v->entry, list); } _tap_list_free(t); } /* orphaned minors */ list_splice_tail(&minors, list); return 0; fail: tap_ctl_list_free(list); tap_ctl_list_free(&vbds); tap_ctl_list_free(&tapdisks); tap_ctl_list_free(&minors); return err; } int tap_ctl_list_pid(pid_t pid, struct list_head *list) { tap_list_t *t; int err; t = _tap_list_alloc(); if (!t) return -ENOMEM; t->pid = tap_ctl_get_pid(pid); if (t->pid < 0) { _tap_list_free(t); return 0; } err = _tap_ctl_list_tapdisk(t->pid, list); if (err || list_empty(list)) list_add_tail(&t->entry, list); return 0; } int tap_ctl_find_minor(const char *type, const char *path) { struct list_head list = LIST_HEAD_INIT(list); tap_list_t *entry; int minor, err; err = tap_ctl_list(&list); if (err) return err; minor = -1; tap_list_for_each_entry(entry, &list) { if (type && (!entry->type || strcmp(entry->type, type))) continue; if (path && (!entry->path || strcmp(entry->path, path))) continue; minor = entry->minor; break; } tap_ctl_list_free(&list); return minor >= 0 ? minor : -ENOENT; } blktap-2.0.90/control/tap-ctl-major.c0000644000000000000000000000416311664745551016075 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include "tap-ctl.h" int tap_ctl_blk_major(void) { FILE *devices; int rv, major; devices = fopen("/proc/devices", "r"); if (!devices) { rv = -errno; goto out; } do { char buf[32], *s; int n, offset; s = fgets(buf, sizeof(buf), devices); if (!s) break; major = -ENODEV; offset = 0; n = sscanf(buf, "%d tapdev%n", &major, &offset); if (n == 1 && offset) break; } while (1); rv = major; out: if (devices) fclose(devices); return rv; } blktap-2.0.90/control/tap-ctl.c0000644000000000000000000004207511664745551014773 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include "tap-ctl.h" typedef int (*tap_ctl_func_t) (int, char **); struct command { char *name; tap_ctl_func_t func; }; static void tap_cli_list_usage(FILE *stream) { fprintf(stream, "usage: list [-h] [-p pid] [-m minor] [-t type] [-f file]\n"); } static void tap_cli_list_row(tap_list_t *entry) { char minor_str[10] = "-"; char state_str[10] = "-"; char pid_str[10] = "-"; if (entry->pid != -1) sprintf(pid_str, "%d", entry->pid); if (entry->minor != -1) sprintf(minor_str, "%d", entry->minor); if (entry->state != -1) sprintf(state_str, "%#x", entry->state); printf("%8s %4s %4s %10s %s\n", pid_str, minor_str, state_str, entry->type ? : "-", entry->path ? 
: "-"); } static void tap_cli_list_dict(tap_list_t *entry) { int d = 0; if (entry->pid != -1) { if (d) putc(' ', stdout); d = printf("pid=%d", entry->pid); } if (entry->minor != -1) { if (d) putc(' ', stdout); d = printf("minor=%d", entry->minor); } if (entry->state != -1) { if (d) putc(' ', stdout); d = printf("state=%#x", entry->state); } if (entry->type && entry->path) { if (d) putc(' ', stdout); d = printf("args=%s:%s", entry->type, entry->path); } putc('\n', stdout); } int tap_cli_list(int argc, char **argv) { struct list_head list = LIST_HEAD_INIT(list); int c, minor, tty, err; const char *type, *file; tap_list_t *entry; pid_t pid; pid = -1; minor = -1; type = NULL; file = NULL; while ((c = getopt(argc, argv, "m:p:t:f:h")) != -1) { switch (c) { case 'm': minor = atoi(optarg); break; case 'p': pid = atoi(optarg); break; case 't': type = optarg; break; case 'f': file = optarg; break; case '?': goto usage; case 'h': tap_cli_list_usage(stdout); return 0; } } if (pid != -1) err = tap_ctl_list_pid(pid, &list); else err = tap_ctl_list(&list); if (err) return -err; tty = isatty(STDOUT_FILENO); tap_list_for_each_entry(entry, &list) { if (minor >= 0 && entry->minor != minor) continue; if (pid >= 0 && entry->pid != pid) continue; if (type && entry->type && strcmp(entry->type, type)) continue; if (file && entry->path && strcmp(entry->path, file)) continue; if (tty) tap_cli_list_row(entry); else tap_cli_list_dict(entry); } tap_ctl_list_free(&list); return 0; usage: tap_cli_list_usage(stderr); return EINVAL; } static void tap_cli_allocate_usage(FILE *stream) { fprintf(stream, "usage: allocate [-d device name]>\n"); } static int tap_cli_allocate(int argc, char **argv) { char *devname; int c, minor, err; devname = NULL; optind = 0; while ((c = getopt(argc, argv, "d:h")) != -1) { switch (c) { case 'd': devname = optarg; break; case '?': goto usage; case 'h': tap_cli_allocate_usage(stdout); return 0; } } err = tap_ctl_allocate(&minor, &devname); if (!err) printf("%s\n", 
devname); return err; usage: tap_cli_allocate_usage(stderr); return EINVAL; } static void tap_cli_free_usage(FILE *stream) { fprintf(stream, "usage: free <-m minor>\n"); } static int tap_cli_free(int argc, char **argv) { int c, minor; minor = -1; optind = 0; while ((c = getopt(argc, argv, "m:h")) != -1) { switch (c) { case 'm': minor = atoi(optarg); break; case '?': goto usage; case 'h': tap_cli_free_usage(stdout); return 0; } } if (minor == -1) goto usage; return tap_ctl_free(minor); usage: tap_cli_free_usage(stderr); return EINVAL; } static void tap_cli_create_usage(FILE *stream) { fprintf(stream, "usage: create <-a args> [-d device name] [-R readonly] " "[-e stack on existing tapdisk for the parent chain] " "[-r turn on read caching into leaf node] [-2 " "use secondary image (in mirror mode if no -s)] [-s " "fail over to the secondary image on ENOSPC]\n"); } static int tap_cli_create(int argc, char **argv) { int c, err, flags, prt_minor; char *args, *devname, *secondary; args = NULL; devname = NULL; secondary = NULL; prt_minor = -1; flags = 0; optind = 0; while ((c = getopt(argc, argv, "a:Rd:e:r2:sh")) != -1) { switch (c) { case 'a': args = optarg; break; case 'd': devname = optarg; break; case 'R': flags |= TAPDISK_MESSAGE_FLAG_RDONLY; break; case 'r': flags |= TAPDISK_MESSAGE_FLAG_ADD_LCACHE; break; case 'e': flags |= TAPDISK_MESSAGE_FLAG_REUSE_PRT; prt_minor = atoi(optarg); break; case '2': flags |= TAPDISK_MESSAGE_FLAG_SECONDARY; secondary = optarg; break; case 's': flags |= TAPDISK_MESSAGE_FLAG_STANDBY; break; case '?': goto usage; case 'h': tap_cli_create_usage(stdout); return 0; } } if (!args) goto usage; err = tap_ctl_create(args, &devname, flags, prt_minor, secondary); if (!err) printf("%s\n", devname); return err; usage: tap_cli_create_usage(stderr); return EINVAL; } static void tap_cli_destroy_usage(FILE *stream) { fprintf(stream, "usage: destroy <-p pid> <-m minor>\n"); } static struct timeval* tap_cli_timeout(const char *optarg) { static struct 
timeval tv; struct timeval now; tv.tv_sec = atoi(optarg); tv.tv_usec = 0; gettimeofday(&now, NULL); timeradd(&tv, &now, &tv); return &tv; } static int tap_cli_destroy(int argc, char **argv) { int c, pid, minor; struct timeval *timeout; pid = -1; minor = -1; timeout = NULL; optind = 0; while ((c = getopt(argc, argv, "p:m:t:h")) != -1) { switch (c) { case 'p': pid = atoi(optarg); break; case 'm': minor = atoi(optarg); break; case 't': timeout = tap_cli_timeout(optarg); if (!timeout) goto usage; break; case '?': goto usage; case 'h': tap_cli_destroy_usage(stdout); return 0; } } if (pid == -1 || minor == -1) goto usage; return tap_ctl_destroy(pid, minor, 0, timeout); usage: tap_cli_destroy_usage(stderr); return EINVAL; } static void tap_cli_spawn_usage(FILE *stream) { fprintf(stream, "usage: spawn\n"); } static int tap_cli_spawn(int argc, char **argv) { int c, tty; pid_t pid; optind = 0; while ((c = getopt(argc, argv, "h")) != -1) { switch (c) { case '?': goto usage; case 'h': tap_cli_spawn_usage(stdout); return 0; } } pid = tap_ctl_spawn(); if (pid < 0) return pid; tty = isatty(STDOUT_FILENO); if (tty) printf("tapdisk spawned with pid %d\n", pid); else printf("%d\n", pid); return 0; usage: tap_cli_spawn_usage(stderr); return EINVAL; } static void tap_cli_attach_usage(FILE *stream) { fprintf(stream, "usage: attach <-p pid> <-m minor>\n"); } static int tap_cli_attach(int argc, char **argv) { int c, pid, minor; pid = -1; minor = -1; optind = 0; while ((c = getopt(argc, argv, "p:m:h")) != -1) { switch (c) { case 'p': pid = atoi(optarg); break; case 'm': minor = atoi(optarg); break; case '?': goto usage; case 'h': tap_cli_attach_usage(stderr); return 0; } } if (pid == -1 || minor == -1) goto usage; return tap_ctl_attach(pid, minor); usage: tap_cli_attach_usage(stderr); return EINVAL; } static void tap_cli_detach_usage(FILE *stream) { fprintf(stream, "usage: detach <-p pid> <-m minor>\n"); } static int tap_cli_detach(int argc, char **argv) { int c, pid, minor; pid = -1; 
minor = -1; optind = 0; while ((c = getopt(argc, argv, "p:m:h")) != -1) { switch (c) { case 'p': pid = atoi(optarg); break; case 'm': minor = atoi(optarg); break; case '?': goto usage; case 'h': tap_cli_detach_usage(stdout); return 0; } } if (pid == -1 || minor == -1) goto usage; return tap_ctl_detach(pid, minor); usage: tap_cli_detach_usage(stderr); return EINVAL; } static void tap_cli_close_usage(FILE *stream) { fprintf(stream, "usage: close <-p pid> <-m minor> [-f force]\n"); } static int tap_cli_close(int argc, char **argv) { int c, pid, minor, force; struct timeval *timeout; pid = -1; minor = -1; force = 0; timeout = NULL; optind = 0; while ((c = getopt(argc, argv, "p:m:ft:h")) != -1) { switch (c) { case 'p': pid = atoi(optarg); break; case 'm': minor = atoi(optarg); break; case 'f': force = -1; break; case 't': timeout = tap_cli_timeout(optarg); if (!timeout) goto usage; break; case '?': goto usage; case 'h': tap_cli_close_usage(stdout); return 0; } } if (pid == -1 || minor == -1) goto usage; return tap_ctl_close(pid, minor, force, timeout); usage: tap_cli_close_usage(stderr); return EINVAL; } static void tap_cli_pause_usage(FILE *stream) { fprintf(stream, "usage: pause <-p pid> <-m minor>\n"); } static int tap_cli_pause(int argc, char **argv) { int c, pid, minor; struct timeval *timeout; pid = -1; minor = -1; timeout = NULL; optind = 0; while ((c = getopt(argc, argv, "p:m:t:h")) != -1) { switch (c) { case 'p': pid = atoi(optarg); break; case 'm': minor = atoi(optarg); break; case 't': timeout = tap_cli_timeout(optarg); if (!timeout) goto usage; case '?': goto usage; case 'h': tap_cli_pause_usage(stdout); return 0; } } if (pid == -1 || minor == -1) goto usage; return tap_ctl_pause(pid, minor, timeout); usage: tap_cli_pause_usage(stderr); return EINVAL; } static void tap_cli_unpause_usage(FILE *stream) { fprintf(stream, "usage: unpause <-p pid> <-m minor> [-a args]\n"); } int tap_cli_unpause(int argc, char **argv) { const char *args; int c, pid, minor; pid = 
-1; minor = -1; args = NULL; optind = 0; while ((c = getopt(argc, argv, "p:m:a:h")) != -1) { switch (c) { case 'p': pid = atoi(optarg); break; case 'm': minor = atoi(optarg); break; case 'a': args = optarg; break; case '?': goto usage; case 'h': tap_cli_unpause_usage(stdout); return 0; } } if (pid == -1 || minor == -1) goto usage; return tap_ctl_unpause(pid, minor, args); usage: tap_cli_unpause_usage(stderr); return EINVAL; } static void tap_cli_major_usage(FILE *stream) { fprintf(stream, "usage: major [-h]\n"); } static int tap_cli_major(int argc, char **argv) { int c, chr, major; chr = 0; while ((c = getopt(argc, argv, "bch")) != -1) { switch (c) { case 'b': chr = 0; break; case 'c': chr = 1; break; case '?': goto usage; case 'h': tap_cli_major_usage(stdout); return 0; default: goto usage; } } if (chr) major = -EINVAL; else major = tap_ctl_blk_major(); if (major < 0) return -major; printf("%d\n", major); return 0; usage: tap_cli_major_usage(stderr); return EINVAL; } static void tap_cli_open_usage(FILE *stream) { fprintf(stream, "usage: open <-p pid> <-m minor> <-a args> [-R readonly] " "[-e stack on existing tapdisk for the parent chain] " "[-r turn on read caching into leaf node] [-2 " "use secondary image (in mirror mode if no -s)] [-s " "fail over to the secondary image on ENOSPC]\n"); } static int tap_cli_open(int argc, char **argv) { const char *args, *secondary; int c, pid, minor, flags, prt_minor; flags = 0; pid = -1; minor = -1; prt_minor = -1; args = NULL; secondary = NULL; optind = 0; while ((c = getopt(argc, argv, "a:Rm:p:e:r2:sh")) != -1) { switch (c) { case 'p': pid = atoi(optarg); break; case 'm': minor = atoi(optarg); break; case 'a': args = optarg; break; case 'R': flags |= TAPDISK_MESSAGE_FLAG_RDONLY; break; case 'r': flags |= TAPDISK_MESSAGE_FLAG_ADD_LCACHE; break; case 'e': flags |= TAPDISK_MESSAGE_FLAG_REUSE_PRT; prt_minor = atoi(optarg); break; case '2': flags |= TAPDISK_MESSAGE_FLAG_SECONDARY; secondary = optarg; break; case 's': flags |= 
TAPDISK_MESSAGE_FLAG_STANDBY; break; case '?': goto usage; case 'h': tap_cli_open_usage(stdout); return 0; } } if (pid == -1 || minor == -1 || !args) goto usage; return tap_ctl_open(pid, minor, args, flags, prt_minor, secondary); usage: tap_cli_open_usage(stderr); return EINVAL; } static void tap_cli_stats_usage(FILE *stream) { fprintf(stream, "usage: stats <-p pid> <-m minor>\n"); } static int tap_cli_stats(int argc, char **argv) { pid_t pid; int c, minor, err; pid = -1; minor = -1; optind = 0; while ((c = getopt(argc, argv, "p:m:h")) != -1) { switch (c) { case 'p': pid = atoi(optarg); break; case 'm': minor = atoi(optarg); break; case '?': goto usage; case 'h': tap_cli_stats_usage(stdout); return 0; } } if (pid == -1 || minor == -1) goto usage; err = tap_ctl_stats_fwrite(pid, minor, stdout); if (err) return err; fprintf(stdout, "\n"); return 0; usage: tap_cli_stats_usage(stderr); return EINVAL; } static void tap_cli_check_usage(FILE *stream) { fprintf(stream, "usage: check\n" "(checks whether environment is suitable for tapdisk2)\n"); } static int tap_cli_check(int argc, char **argv) { int err; const char *msg; if (argc != 1) goto usage; err = tap_ctl_check(&msg); printf("%s\n", msg); return err; usage: tap_cli_check_usage(stderr); return EINVAL; } struct command commands[] = { { .name = "list", .func = tap_cli_list }, { .name = "allocate", .func = tap_cli_allocate }, { .name = "free", .func = tap_cli_free }, { .name = "create", .func = tap_cli_create }, { .name = "destroy", .func = tap_cli_destroy }, { .name = "spawn", .func = tap_cli_spawn }, { .name = "attach", .func = tap_cli_attach }, { .name = "detach", .func = tap_cli_detach }, { .name = "open", .func = tap_cli_open }, { .name = "close", .func = tap_cli_close }, { .name = "pause", .func = tap_cli_pause }, { .name = "unpause", .func = tap_cli_unpause }, { .name = "stats", .func = tap_cli_stats }, { .name = "major", .func = tap_cli_major }, { .name = "check", .func = tap_cli_check }, }; #define 
print_commands() \ do { \ int i, n; \ n = sizeof(commands) / sizeof(struct command); \ printf("COMMAND := { "); \ printf("%s", commands[0].name); \ for (i = 1; i < n; i++) \ printf(" | %s", commands[i].name); \ printf(" }\n"); \ } while (0) void help(void) { printf("usage: tap-ctl COMMAND [OPTIONS]\n"); print_commands(); exit(0); } struct command * get_command(char *command) { int i, n; if (strnlen(command, 25) >= 25) return NULL; n = sizeof(commands) / sizeof (struct command); for (i = 0; i < n; i++) if (!strcmp(command, commands[i].name)) return &commands[i]; return NULL; } int main(int argc, char *argv[]) { char **cargv; const char *msg; struct command *cmd; int cargc, i, cnt, ret; #ifdef CORE_DUMP #include struct rlimit rlim; rlim.rlim_cur = RLIM_INFINITY; rlim.rlim_max = RLIM_INFINITY; if (setrlimit(RLIMIT_CORE, &rlim) < 0) PERROR("setrlimit failed"); #endif signal(SIGPIPE, SIG_IGN); ret = 0; if (argc < 2) help(); cargc = argc - 1; cmd = get_command(argv[1]); if (!cmd) { EPRINTF("invalid COMMAND %s", argv[1]); help(); } ret = tap_ctl_check(&msg); if (ret) { printf("%s\n", msg); return ret; } cargv = malloc(sizeof(char *) * cargc); if (!cargv) exit(ENOMEM); cnt = 1; cargv[0] = cmd->name; for (i = 1; i < cargc; i++) { char *arg = argv[i + (argc - cargc)]; if (!strcmp(arg, "--debug")) { tap_ctl_debug = 1; continue; } cargv[cnt++] = arg; } ret = cmd->func(cnt, cargv); free(cargv); return (ret >= 0 ? ret : -ret); } blktap-2.0.90/control/tap-ctl-free.c0000644000000000000000000000407511664745551015710 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include "tap-ctl.h" #include "blktap2.h" int tap_ctl_free(const int minor) { int fd, err; fd = open(BLKTAP2_CONTROL_DEVICE, O_RDONLY); if (fd == -1) { EPRINTF("failed to open control device: %d\n", errno); return errno; } err = ioctl(fd, BLKTAP2_IOCTL_FREE_TAP, minor); close(fd); return err ? -errno : 0; } blktap-2.0.90/README0000644000000000000000000001134711664745551012461 0ustar rootrootBlktap Userspace Tools + Library ================================ Andrew Warfield and Julian Chesterfield 16th June 2006 {firstname.lastname}@cl.cam.ac.uk The blktap userspace toolkit provides a user-level disk I/O interface. 
The blktap mechanism involves a kernel driver that acts similarly to the existing Xen/Linux blkback driver, and a set of associated user-level libraries. Using these tools, blktap allows virtual block devices presented to VMs to be implemented in userspace and to be backed by raw partitions, files, network, etc. The key benefit of blktap is that it makes it easy and fast to write arbitrary block backends, and that these user-level backends actually perform very well. Specifically: - Metadata disk formats such as Copy-on-Write, encrypted disks, sparse formats and other compression features can be easily implemented. - Accessing file-based images from userspace avoids problems related to flushing dirty pages which are present in the Linux loopback driver. (Specifically, doing a large number of writes to an NFS-backed image doesn't result in the OOM killer going berserk.) - Per-disk handler processes enable easier userspace policing of block resources, and process-granularity QoS techniques (disk scheduling and related tools) may be trivially applied to block devices. - It's very easy to take advantage of userspace facilities such as networking libraries, compression utilities, peer-to-peer file-sharing systems and so on to build more complex block backends. - Crashes are contained -- incremental development/debugging is very fast. How it works (in one paragraph): Working in conjunction with the kernel blktap driver, all disk I/O requests from VMs are passed to the userspace daemon (using a shared memory interface) through a character device. Each active disk is mapped to an individual device node, allowing per-disk processes to implement individual block devices where desired. The userspace drivers are implemented using asynchronous (Linux libaio), O_DIRECT-based calls to preserve the unbuffered, batched and asynchronous request dispatch achieved with the existing blkback code. 
We provide a simple, asynchronous virtual disk interface that makes it quite easy to add new disk implementations. As of June 2006 the currently supported disk formats are: - Raw Images (both on partitions and in image files) - File-backed Qcow disks - Standalone sparse Qcow disks - Fast shareable RAM disk between VMs (requires some form of cluster-based filesystem support e.g. OCFS2 in the guest kernel) - Some VMDK images - your mileage may vary Raw and QCow images have asynchronous backends and so should perform fairly well. VMDK is based directly on the qemu vmdk driver, which is synchronous (a.k.a. slow). Build and Installation Instructions =================================== Make sure to configure the blktap backend driver in your dom0 kernel. It will cooperate fine with the existing backend driver, so you can experiment with tap disks without breaking existing VM configs. To build the tools separately, run "make && make install" in tools/blktap. Using the Tools =============== Prepare the image for booting. For qcow files use the qcow utilities installed earlier. e.g. qcow-create generates a blank standalone image or a file-backed CoW image. img2qcow takes an existing image or partition and creates a sparse, standalone qcow-based file. The userspace disk agent is configured to start automatically via xend (alternatively you can start it manually => 'blktapctrl') Customise the VM config file to use the 'tap' handler, followed by the driver type. e.g. for a raw image such as a file or partition: disk = ['tap:aio:<FILENAME>,sda1,w'] e.g. for a qcow image: disk = ['tap:qcow:<FILENAME>,sda1,w'] Mounting images in Dom0 using the blktap driver =============================================== Tap (and blkback) disks are also mountable in Dom0 without requiring an active VM to attach. You will need to build a xenlinux Dom0 kernel that includes the blkfront driver (e.g. the default 'make world' or 'make kernels' build). 
Simply use the xm command-line tool to activate the backend disks, and blkfront will generate a virtual block device that can be accessed in the same way as a loop device or partition: e.g. for a raw image file that would normally be mounted using the loopback driver (such as 'mount -o loop /mnt/disk'), do the following: xm block-attach 0 tap:aio: /dev/xvda1 w 0 mount /dev/xvda1 /mnt/disk <--- don't use loop driver In this way, you can use any of the userspace device-type drivers built with the blktap userspace toolkit to open and mount disks such as qcow or vmdk images: xm block-attach 0 tap:qcow: /dev/xvda1 w 0 mount /dev/xvda1 /mnt/disk blktap-2.0.90/VERSION0000644000000000000000000000000611664745551012637 0ustar rootroot2.0.90blktap-2.0.90/vhd/0000755000000000000000000000000011664745551012354 5ustar rootrootblktap-2.0.90/vhd/vhd-util.c0000644000000000000000000001074611664745551014264 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include "libvhd.h" #include "vhd-util.h" #if 1 #define DFPRINTF(_f, _a...) fprintf(stdout, _f , ##_a) #else #define DFPRINTF(_f, _a...) ((void)0) #endif typedef int (*vhd_util_func_t) (int, char **); struct command { char *name; vhd_util_func_t func; }; struct command commands[] = { { .name = "create", .func = vhd_util_create }, { .name = "snapshot", .func = vhd_util_snapshot }, { .name = "query", .func = vhd_util_query }, { .name = "read", .func = vhd_util_read }, { .name = "set", .func = vhd_util_set_field }, { .name = "repair", .func = vhd_util_repair }, { .name = "resize", .func = vhd_util_resize }, { .name = "fill", .func = vhd_util_fill }, { .name = "coalesce", .func = vhd_util_coalesce }, { .name = "modify", .func = vhd_util_modify }, { .name = "scan", .func = vhd_util_scan }, { .name = "check", .func = vhd_util_check }, { .name = "revert", .func = vhd_util_revert }, }; #define print_commands() \ do { \ int i, n; \ n = sizeof(commands) / sizeof(struct command); \ printf("COMMAND := { "); \ printf("%s", commands[0].name); \ for (i = 1; i < n; i++) \ printf(" | %s", commands[i].name); \ printf(" }\n"); \ } while (0) TEST_FAIL_EXTERN_VARS; void help(void) { printf("usage: vhd-util COMMAND [OPTIONS]\n"); print_commands(); exit(0); } struct command * get_command(char *command) { int i, n; if (strnlen(command, 25) >= 25) return NULL; n = sizeof(commands) / 
sizeof (struct command); for (i = 0; i < n; i++) if (!strcmp(command, commands[i].name)) return &commands[i]; return NULL; } int main(int argc, char *argv[]) { char **cargv; struct command *cmd; int cargc, i, cnt, ret; #ifdef CORE_DUMP #include struct rlimit rlim; rlim.rlim_cur = RLIM_INFINITY; rlim.rlim_max = RLIM_INFINITY; if (setrlimit(RLIMIT_CORE, &rlim) < 0) fprintf(stderr, "setrlimit failed: %d\n", errno); #endif ret = 0; if (argc < 2) help(); cargc = argc - 1; cmd = get_command(argv[1]); if (!cmd) { fprintf(stderr, "invalid COMMAND %s\n", argv[1]); help(); } cargv = malloc(sizeof(char *) * cargc); if (!cargv) exit(ENOMEM); cnt = 1; cargv[0] = cmd->name; for (i = 1; i < cargc; i++) { char *arg = argv[i + (argc - cargc)]; if (!strcmp(arg, "--debug")) { libvhd_set_log_level(1); continue; } cargv[cnt++] = arg; } #ifdef ENABLE_FAILURE_TESTING for (i = 0; i < NUM_FAIL_TESTS; i++) { TEST_FAIL[i] = 0; if (getenv(ENV_VAR_FAIL[i])) TEST_FAIL[i] = 1; } #endif // ENABLE_FAILURE_TESTING ret = cmd->func(cnt, cargv); free(cargv); return (ret >= 0 ? ret : -ret); } blktap-2.0.90/vhd/lib/0000755000000000000000000000000011664745551013122 5ustar rootrootblktap-2.0.90/vhd/lib/vhd-util-create.c0000644000000000000000000000546511664745551016275 0ustar rootroot/* * Copyright (c) 2007, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. 
nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include "libvhd.h" int vhd_util_create(int argc, char **argv) { char *name; uint64_t size, msize; int c, sparse, err; vhd_flag_creat_t flags; err = -EINVAL; size = 0; msize = 0; sparse = 1; name = NULL; flags = 0; if (!argc || !argv) goto usage; optind = 0; while ((c = getopt(argc, argv, "n:s:S:rh")) != -1) { switch (c) { case 'n': name = optarg; break; case 's': err = 0; size = strtoull(optarg, NULL, 10); break; case 'S': err = 0; msize = strtoull(optarg, NULL, 10); break; case 'r': sparse = 0; break; case 'h': default: goto usage; } } if (err || !name || optind != argc) goto usage; if (msize && msize < size) { printf("Error: <-S size> must be greater than <-s size>\n"); return -EINVAL; } return vhd_create(name, size << 20, (sparse ? 
HD_TYPE_DYNAMIC : HD_TYPE_FIXED), msize << 20, flags); usage: printf("options: <-n name> <-s size (MB)> [-r reserve] [-h help] " "[<-S size (MB) for metadata preallocation " "(see vhd-util resize)>]\n"); return -EINVAL; } blktap-2.0.90/vhd/lib/test/0000755000000000000000000000000011664745551014101 5ustar rootrootblktap-2.0.90/vhd/lib/test/Makefile.am0000644000000000000000000000021011664745551016126 0ustar rootroot AM_CFLAGS = -Wall AM_CFLAGS += -Werror AM_CPPFLAGS = -D_GNU_SOURCE noinst_PROGRAMS = random-copy noinst_PROGRAMS += test-snapshot blktap-2.0.90/vhd/lib/test/test-snapshot.c0000644000000000000000000000475011664745551017067 0ustar rootroot/* * libvhdio.so supports a simple test hook for validating vhd chains: * if LIBVHD_IO_TEST is set, libvhdio will handle SIGCONT specially * by closing, snapshotting, and reopening any vhds it is tracking. * * this harness simply forks a test and stops/continues it at a given interval. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include static void usage(const char *app, int err) { printf("usage: %s <-i interval> -- \n", app); exit(err); } static void sighandler(int sig) { fprintf(stderr, "child exited\n"); exit(0); } static void stop(pid_t pid) { int status; fprintf(stderr, "stopping %d\n", pid); if (kill(pid, SIGSTOP)) { perror("stop child"); exit(1); } if (waitpid(pid, &status, WUNTRACED) == -1) { perror("waiting for child to stop"); exit(1); } if (WIFEXITED(status)) exit(0); if (!WIFSTOPPED(status)) { perror("child not stopped"); exit(1); } } static void resume(pid_t pid) { int status; fprintf(stderr, "resuming %d\n", pid); if (kill(pid, SIGCONT)) { perror("resume child"); exit(1); } if (waitpid(pid, &status, WCONTINUED) == -1) { perror("waiting for child to resume"); exit(1); } if (WIFEXITED(status)) exit(0); if (!WIFCONTINUED(status)) { perror("child not resumed"); exit(1); } } static void test(pid_t pid, int interval) { for (;;) { fprintf(stderr, 
"sleeping\n"); sleep(interval); stop(pid); resume(pid); } } int main(int argc, char **argv) { pid_t pid; sigset_t set; int c, interval; struct sigaction act; interval = 0; while ((c = getopt(argc, argv, "i:h")) != -1) { switch (c) { case 'i': interval = atoi(optarg); break; case 'h': usage(argv[0], 0); break; default: usage(argv[0], EINVAL); break; } } if (optind == argc || !interval) usage(argv[0], EINVAL); if (sigemptyset(&set)) { perror("init sigset"); exit(1); } act = (struct sigaction) { .sa_handler = sighandler, .sa_mask = set, .sa_flags = SA_NOCLDSTOP, }; if (sigaction(SIGCHLD, &act, NULL)) { perror("register sig handler"); exit(1); } switch ((pid = fork())) { case 0: if (putenv("LIBVHD_IO_TEST=y")) { perror("setting environment"); exit(errno); } execvp(argv[optind], &argv[optind]); perror("exec"); exit(errno); case -1: perror("fork"); exit(errno); default: test(pid, interval); break; } return 0; } blktap-2.0.90/vhd/lib/test/random-copy.c0000644000000000000000000000745411664745551016507 0ustar rootroot #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include struct range { off64_t start; off64_t end; }; struct random_copy_ctx { int sfd; int dfd; int total_chunks; struct range *chunks; }; static void usage(const char *app, int err) { printf("usage: %s \n", app); exit(err); } static int random_copy_carve_source(struct random_copy_ctx *ctx) { int err, i, n; struct stat64 st; off64_t bytes, start; err = fstat64(ctx->sfd, &st); if (err) { perror("stat source"); return errno; } n = 100; start = 0; bytes = st.st_size; ctx->chunks = calloc(n, sizeof(struct range)); if (!ctx->chunks) { printf("calloc failed\n"); return ENOMEM; } for (i = 0; start < st.st_size; i++) { int chunk; off64_t end; if (i == n) { struct range *new; n *= 2; new = realloc(ctx->chunks, n * sizeof(struct range)); if (!new) { free(ctx->chunks); ctx->chunks = NULL; printf("realloc failed\n"); return ENOMEM; } ctx->chunks = new; } chunk 
= (random() % (st.st_size / 10)) + 1; end = start + chunk; if (end >= st.st_size) end = st.st_size - 1; ctx->chunks[i].start = start; ctx->chunks[i].end = end; bytes -= (end - start); start = end + 1; } ctx->total_chunks = i; return 0; } static int random_copy_permute_source(struct random_copy_ctx *ctx) { int i; for (i = 0; i < ctx->total_chunks; i++) { int idx = random() % ctx->total_chunks; struct range tmp = ctx->chunks[idx]; ctx->chunks[idx] = ctx->chunks[i]; ctx->chunks[i] = tmp; } return 0; } static int random_copy_init(struct random_copy_ctx *ctx, const char *src, const char *dst) { int err; memset(ctx, 0, sizeof(*ctx)); ctx->sfd = ctx->dfd = -1; ctx->sfd = open(src, O_LARGEFILE | O_RDONLY); if (ctx->sfd == -1) { err = errno; perror("opening source"); goto fail; } ctx->dfd = open(dst, O_LARGEFILE | O_WRONLY); if (ctx->dfd == -1) { err = errno; perror("opening destination"); goto fail; } err = random_copy_carve_source(ctx); if (err) { printf("failed to carve source: %d\n", err); goto fail; } err = random_copy_permute_source(ctx); if (err) { printf("failed to permute source: %d\n", err); goto fail; } return 0; fail: close(ctx->sfd); close(ctx->dfd); memset(ctx, 0, sizeof(*ctx)); return err; } static int random_copy(struct random_copy_ctx *ctx) { char *buf; int i, err; for (i = 0; i < ctx->total_chunks; i++) { struct range *r = &ctx->chunks[i]; size_t count = r->end - r->start + 1; buf = calloc(1, count); if (!buf) { printf("calloc failed\n"); return ENOMEM; } fprintf(stderr, "copying 0x%zx from 0x%"PRIx64"\n", count, r->start); err = pread(ctx->sfd, buf, count, r->start); if (err != count) { printf("pread(0x%zx 0x%"PRIx64") returned 0x%x (%d)\n", count, r->start, err, errno); free(buf); return (errno ? : EIO); } err = pwrite(ctx->dfd, buf, count, r->start); if (err != count) { printf("pwrite(0x%zx 0x%"PRIx64") returned 0x%x (%d)\n", count, r->start, err, errno); free(buf); return (errno ? 
: EIO); } free(buf); } return 0; } static void random_copy_close(struct random_copy_ctx *ctx) { close(ctx->sfd); close(ctx->dfd); free(ctx->chunks); } int main(int argc, char *argv[]) { int err; char *src, *dst; struct random_copy_ctx ctx; if (argc != 3) usage(argv[0], EINVAL); src = argv[1]; dst = argv[2]; err = random_copy_init(&ctx, src, dst); if (err) { printf("failed to init: %d\n", err); exit(err); } err = random_copy(&ctx); if (err) printf("copy failed: %d\n", err); random_copy_close(&ctx); return err; } blktap-2.0.90/vhd/lib/vhd-util-snapshot.c0000644000000000000000000001233511664745551016663 0ustar rootroot/* * Copyright (c) 2007, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include "libvhd.h" static int vhd_util_find_snapshot_target(const char *name, char **result, int *parent_raw) { int i, err; char *target; vhd_context_t vhd; *parent_raw = 0; *result = NULL; target = strdup(name); if (!target) return -ENOMEM; for (;;) { err = vhd_open(&vhd, target, VHD_OPEN_RDONLY); if (err) return err; if (vhd.footer.type != HD_TYPE_DIFF) goto out; err = vhd_get_bat(&vhd); if (err) goto out; for (i = 0; i < vhd.bat.entries; i++) if (vhd.bat.bat[i] != DD_BLK_UNUSED) goto out; free(target); err = vhd_parent_locator_get(&vhd, &target); if (err) goto out; if (vhd_parent_raw(&vhd)) { *parent_raw = 1; goto out; } vhd_close(&vhd); } out: vhd_close(&vhd); if (err) free(target); else *result = target; return err; } static int vhd_util_check_depth(const char *name, int *depth) { int err; vhd_context_t vhd; err = vhd_open(&vhd, name, VHD_OPEN_RDONLY); if (err) return err; err = vhd_chain_depth(&vhd, depth); vhd_close(&vhd); return err; } int vhd_util_snapshot(int argc, char **argv) { vhd_flag_creat_t flags; int c, err, prt_raw, limit, empty_check; char *name, *pname, *backing; char *ppath, __ppath[PATH_MAX]; uint64_t size, msize; vhd_context_t vhd; name = NULL; pname = NULL; ppath = NULL; backing = NULL; size = 0; msize = 0; flags = 0; limit = 0; empty_check = 1; if (!argc || !argv) { err = -EINVAL; goto usage; } optind = 0; while ((c = 
getopt(argc, argv, "n:p:S:l:meh")) != -1) { switch (c) { case 'n': name = optarg; break; case 'p': pname = optarg; break; case 'S': msize = strtoull(optarg, NULL, 10); case 'l': limit = strtol(optarg, NULL, 10); break; case 'm': vhd_flag_set(flags, VHD_FLAG_CREAT_PARENT_RAW); break; case 'e': empty_check = 0; break; case 'h': err = 0; goto usage; default: err = -EINVAL; goto usage; } } if (!name || !pname || optind != argc) { err = -EINVAL; goto usage; } ppath = realpath(pname, __ppath); if (!ppath) return -errno; if (vhd_flag_test(flags, VHD_FLAG_CREAT_PARENT_RAW) || !empty_check) { backing = strdup(ppath); if (!backing) { err = -ENOMEM; goto out; } } else { err = vhd_util_find_snapshot_target(ppath, &backing, &prt_raw); if (err) { backing = NULL; goto out; } /* * if the sizes of the parent chain are non-uniform, we need to * pick the right size: that of the supplied parent */ if (strcmp(ppath, backing)) { err = vhd_open(&vhd, ppath, VHD_OPEN_RDONLY); if (err) goto out; size = vhd.footer.curr_size; vhd_close(&vhd); } if (prt_raw) vhd_flag_set(flags, VHD_FLAG_CREAT_PARENT_RAW); } if (limit && !vhd_flag_test(flags, VHD_FLAG_CREAT_PARENT_RAW)) { int depth; err = vhd_util_check_depth(backing, &depth); if (err) printf("error checking snapshot depth: %d\n", err); else if (depth + 1 > limit) { err = -ENOSPC; printf("snapshot depth exceeded: " "current depth: %d, limit: %d\n", depth, limit); } if (err) goto out; } err = vhd_snapshot(name, size, backing, msize << 20, flags); out: free(backing); return err; usage: printf("options: <-n name> <-p parent name> [-l snapshot depth limit]" " [-m parent_is_raw] [-S size (MB) for metadata preallocation " "(see vhd-util resize)] [-e link to supplied parent name even " "if it's empty] [-h help]\n"); return err; } blktap-2.0.90/vhd/lib/atomicio.h0000644000000000000000000000311611664745551015100 0ustar rootroot/* $OpenBSD: atomicio.h,v 1.6 2005/05/24 17:32:43 avsm Exp $ */ /* * Copyright (c) 1995,1999 Theo de Raadt. 
All rights reserved. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Ensure all of data on socket comes through. f==read || f==vwrite */ size_t atomicio(ssize_t (*)(int, void *, size_t), int, void *, size_t); #define vwrite (ssize_t (*)(int, void *, size_t))write blktap-2.0.90/vhd/lib/vhd-util-modify.c0000644000000000000000000001027511664745551016314 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include "libvhd.h" TEST_FAIL_EXTERN_VARS; static int vhd_util_zero_bat(vhd_context_t *vhd) { int err, map_bytes; uint64_t i; err = vhd_get_bat(vhd); if (err) return err; if (vhd_has_batmap(vhd)) { err = vhd_get_batmap(vhd); if (err) return err; } for (i = 0; i < vhd->bat.entries; i++) vhd->bat.bat[i] = DD_BLK_UNUSED; err = vhd_write_bat(vhd, &vhd->bat); if (err) return err; map_bytes = ((vhd->footer.curr_size >> VHD_SECTOR_SHIFT) / vhd->spb) >> 3; map_bytes = vhd_sectors_to_bytes(secs_round_up_no_zero(map_bytes)); memset(vhd->batmap.map, 0, map_bytes); return vhd_write_batmap(vhd, &vhd->batmap); } int vhd_util_modify(int argc, char **argv) { char *name; vhd_context_t vhd; int err, c, size, parent, parent_raw, kill_data; off64_t newsize = 0; char *newparent = NULL; name = NULL; size = 0; parent = 0; parent_raw = 0; kill_data = 0; optind = 0; while ((c = getopt(argc, argv, "n:s:p:mzh")) != -1) { switch (c) { case 'n': name = optarg; break; case 's': size = 1; errno = 0; newsize = strtoll(optarg, NULL, 10); if (errno) { fprintf(stderr, "Invalid size '%s'\n", optarg); goto usage; } break; case 'p': parent = 1; newparent = optarg; break; case 'm': parent_raw = 1; break; case 'z': kill_data = 1; break; case 'h': default: goto usage; } } if (!name || optind != argc) goto usage; err = vhd_open(&vhd, name, VHD_OPEN_RDWR); if (err) { printf("error opening %s: %d\n", name, err); return err; } if (kill_data) { if (vhd_type_dynamic(&vhd)) err = vhd_util_zero_bat(&vhd); else err = -ENOSYS; if (!err && !vhd.is_block) // truncate file-based VHDs err = vhd_write_footer(&vhd, &vhd.footer); if (err) printf("failed to zero VHD: %d\n", err); } if (size) { err = vhd_set_phys_size(&vhd, newsize); if (err) printf("failed to set physical size to %"PRIu64":" " %d\n", newsize, err); } if (parent) { TEST_FAIL_AT(FAIL_REPARENT_BEGIN); err = vhd_change_parent(&vhd, newparent, parent_raw); if (err) { 
printf("failed to set parent to '%s': %d\n", newparent, err); goto done; } TEST_FAIL_AT(FAIL_REPARENT_END); } done: vhd_close(&vhd); return err; usage: printf("*** Dangerous operations, use with care ***\n"); printf("options: <-n name> [-p NEW_PARENT set parent [-m raw]] " "[-s NEW_SIZE set size] [-z zero (kill data)] " "[-h help]\n"); return -EINVAL; } blktap-2.0.90/vhd/lib/vhd-util-fill.c0000644000000000000000000000550211664745551015750 0ustar rootroot/* * Copyright (c) 2007, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include "libvhd.h" int vhd_util_fill(int argc, char **argv) { int err, c; char *name; void *buf; vhd_context_t vhd; uint64_t i, sec, secs; buf = NULL; name = NULL; if (!argc || !argv) goto usage; optind = 0; while ((c = getopt(argc, argv, "n:h")) != -1) { switch (c) { case 'n': name = optarg; break; case 'h': default: goto usage; } } if (!name || optind != argc) goto usage; err = vhd_open(&vhd, name, VHD_OPEN_RDWR); if (err) { printf("error opening %s: %d\n", name, err); return err; } err = vhd_get_bat(&vhd); if (err) goto done; err = posix_memalign(&buf, 4096, vhd.header.block_size); if (err) { err = -err; goto done; } sec = 0; secs = vhd.header.block_size >> VHD_SECTOR_SHIFT; for (i = 0; i < vhd.header.max_bat_size; i++) { err = vhd_io_read(&vhd, buf, sec, secs); if (err) goto done; err = vhd_io_write(&vhd, buf, sec, secs); if (err) goto done; sec += secs; } err = 0; done: free(buf); vhd_close(&vhd); return err; usage: printf("options: <-n name> [-h help]\n"); return -EINVAL; } blktap-2.0.90/vhd/lib/relative-path.c0000644000000000000000000001503011664745551016032 0ustar rootroot/* * Copyright (c) 2007, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include "relative-path.h" #define sfree(ptr) \ do { \ free(ptr); \ ptr = NULL; \ } while (0) /* * count number of tokens between DELIMETER characters */ static int count_nodes(char *path) { int i; char *tmp; if (!path) return 0; for (i = 0, tmp = path; *tmp != '\0'; tmp++) if (*tmp == DELIMITER) i++; return i; } /* * return copy of next node in @path, or NULL * @path is moved to the end of the next node * @err is set to -errno on failure * copy should be freed */ static char * next_node(char **path, int *err) { int ret; char *tmp, *start; if (!path || !*path) { *err = -EINVAL; return NULL; } *err = 0; start = *path; for (tmp = *path; *tmp != '\0'; tmp++) if (*tmp == DELIMITER) { int size; char *node; size = tmp - start + 1; node = malloc(size); if (!node) { *err = -ENOMEM; return NULL; } ret = snprintf(node, size, "%s", start); if (ret < 0) { free(node); *err = -EINVAL; return NULL; } *path = tmp; return node; } return NULL; } /* * count number of nodes in common betwee @to and @from * returns number of common nodes, or -errno on failure */ static int count_common_nodes(char *to, char *from) { int err, common; char *to_node, *from_node; if (!to || !from) return -EINVAL; err = 0; common = 0; to_node = NULL; from_node = NULL; do { to_node = next_node(&to, &err); if (err || !to_node) break; from_node = next_node(&from, &err); if (err || !from_node) break; if (strncmp(to_node, from_node, MAX_NAME_LEN)) break; ++to; ++from; ++common; sfree(to_node); sfree(from_node); } while (1); sfree(to_node); sfree(from_node); if (err) return err; return common; } /* * construct path of @count '../', './' if @count is zero, or NULL on error * result should be freed */ static char * up_nodes(int count) { char *path, *tmp; int i, ret, len, size; if (!count) return strdup("./"); len = strlen("../"); size = len * count; if (size >= MAX_NAME_LEN) return NULL; path = malloc(size + 1); if 
(!path) return NULL; tmp = path; for (i = 0; i < count; i++) { ret = sprintf(tmp, "../"); if (ret < 0 || ret != len) { free(path); return NULL; } tmp += ret; } return path; } /* * return pointer to @offset'th node of path or NULL on error */ static char * node_offset(char *from, int offset) { char *path; if (!from || !offset) return NULL; for (path = from; *path != '\0'; path++) { if (*path == DELIMITER) if (--offset == 0) return path + 1; } return NULL; } /* * return a relative path from @from to @to * result should be freed */ char * relative_path_to(char *from, char *to, int *err) { int from_nodes, common; char *to_absolute, __to_absolute[PATH_MAX]; char *from_absolute, __from_absolute[PATH_MAX]; char *up, *common_target_path, *relative_path; *err = 0; up = NULL; to_absolute = NULL; from_absolute = NULL; relative_path = NULL; if (strnlen(to, MAX_NAME_LEN) == MAX_NAME_LEN || strnlen(from, MAX_NAME_LEN) == MAX_NAME_LEN) { EPRINTF("invalid input; max path length is %d\n", MAX_NAME_LEN); *err = -ENAMETOOLONG; return NULL; } to_absolute = realpath(to, __to_absolute); if (!to_absolute) { EPRINTF("failed to get absolute path of %s\n", to); *err = -errno; goto out; } from_absolute = realpath(from, __from_absolute); if (!from_absolute) { EPRINTF("failed to get absolute path of %s\n", from); *err = -errno; goto out; } if (strnlen(to_absolute, MAX_NAME_LEN) == MAX_NAME_LEN || strnlen(from_absolute, MAX_NAME_LEN) == MAX_NAME_LEN) { EPRINTF("invalid input; max path length is %d\n", MAX_NAME_LEN); *err = -ENAMETOOLONG; goto out; } /* count nodes in source path */ from_nodes = count_nodes(from_absolute); /* count nodes in common */ common = count_common_nodes(to_absolute + 1, from_absolute + 1); if (common < 0) { EPRINTF("failed to count common nodes of %s and %s: %d\n", to_absolute, from_absolute, common); *err = common; goto out; } /* move up to common node */ up = up_nodes(from_nodes - common - 1); if (!up) { EPRINTF("failed to allocate relative path for %s: %d\n", 
from_absolute, -ENOMEM); *err = -ENOMEM; goto out; } /* get path from common node to target */ common_target_path = node_offset(to_absolute, common + 1); if (!common_target_path) { EPRINTF("failed to find common target path to %s: %d\n", to_absolute, -EINVAL); *err = -EINVAL; goto out; } /* get relative path */ if (asprintf(&relative_path, "%s%s", up, common_target_path) == -1) { EPRINTF("failed to construct final path %s%s: %d\n", up, common_target_path, -ENOMEM); relative_path = NULL; *err = -ENOMEM; goto out; } out: sfree(up); return relative_path; } blktap-2.0.90/vhd/lib/atomicio.c0000644000000000000000000000371311664745551015076 0ustar rootroot/* * Copyright (c) 2005 Anil Madhavapeddy. All rights reserved. * Copyright (c) 1995,1999 Theo de Raadt. All rights reserved. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
/*
 * ensure all of data on socket comes through. f==read || f==vwrite
 *
 * Calls @f on @fd repeatedly until @n bytes of @_s have been transferred.
 * A call that fails with EINTR or EAGAIN is retried.
 *
 * Returns @n on success.  Returns 0 when @f fails with any other error
 * (errno is left as set by @f), and the number of bytes transferred so
 * far, with errno set to EPIPE, when @f returns 0.
 */
size_t
atomicio(ssize_t (*f)(int, void *, size_t), int fd, void *_s, size_t n)
{
	char *s = _s;
	size_t pos = 0;
	ssize_t res;

	while (n > pos) {
		res = (f)(fd, s + pos, n - pos);
		switch (res) {
		case -1:
			/* "continue" restarts the while loop: try again */
			if (errno == EINTR || errno == EAGAIN)
				continue;
			return 0;
		case 0:
			errno = EPIPE;
			return pos;
		default:
			pos += (size_t)res;
		}
	}

	return (pos);
}
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include "libvhd.h" int vhd_util_repair(int argc, char **argv) { char *name; int err, c; vhd_context_t vhd; name = NULL; if (!argc || !argv) goto usage; optind = 0; while ((c = getopt(argc, argv, "n:h")) != -1) { switch (c) { case 'n': name = optarg; break; case 'h': default: goto usage; } } if (!name || optind != argc) goto usage; err = vhd_open(&vhd, name, VHD_OPEN_RDWR); if (err) { printf("error opening %s: %d\n", name, err); return err; } err = vhd_write_footer(&vhd, &vhd.footer); if (err) printf("error writing footer: %d\n", err); vhd_close(&vhd); return err; usage: printf("options: <-n name> [-h help]\n"); return -EINVAL; } blktap-2.0.90/vhd/lib/vhd-util-revert.c0000644000000000000000000000533011664745551016330 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include //#include #include //#include #include #include "libvhd.h" #include "libvhd-journal.h" int vhd_util_revert(int argc, char **argv) { char *name, *jname; vhd_journal_t journal; int c, err; name = NULL; jname = NULL; optind = 0; while ((c = getopt(argc, argv, "n:j:h")) != -1) { switch (c) { case 'n': name = optarg; break; case 'j': jname = optarg; break; case 'h': default: goto usage; } } if (!name || !jname || argc != optind) goto usage; libvhd_set_log_level(1); err = vhd_journal_open(&journal, name, jname); if (err) { printf("opening journal failed: %d\n", err); return err; } err = vhd_journal_revert(&journal); if (err) { printf("reverting journal failed: %d\n", err); vhd_journal_close(&journal); return err; } err = vhd_journal_remove(&journal); if (err) { printf("removing journal failed: %d\n", err); vhd_journal_close(&journal); return err; } return 0; usage: printf("options: <-n name> <-j journal> [-h help]\n"); return -EINVAL; } blktap-2.0.90/vhd/lib/vhd-util-set-field.c0000644000000000000000000000635511664745551016705 0ustar rootroot/* * Copyright (c) 2007, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include "libvhd.h" int vhd_util_set_field(int argc, char **argv) { long value; int err, c; vhd_context_t vhd; char *name, *field; err = -EINVAL; value = 0; name = NULL; field = NULL; if (!argc || !argv) goto usage; optind = 0; while ((c = getopt(argc, argv, "n:f:v:h")) != -1) { switch (c) { case 'n': name = optarg; break; case 'f': field = optarg; break; case 'v': err = 0; value = strtol(optarg, NULL, 10); break; case 'h': default: goto usage; } } if (!name || !field || optind != argc || err) goto usage; if (strnlen(field, 25) >= 25) { printf("invalid field\n"); goto usage; } if (strcmp(field, "hidden") && strcmp(field, "marker")) { printf("invalid field %s\n", field); goto usage; } if (value < 0 || value > 255) { printf("invalid value %ld\n", value); goto usage; } err = vhd_open(&vhd, name, VHD_OPEN_RDWR); if (err) { printf("error opening %s: %d\n", name, err); return err; } if (!strcmp(field, "hidden")) { vhd.footer.hidden = (char)value; err = vhd_write_footer(&vhd, &vhd.footer); if (err == -ENOSPC && vhd_type_dynamic(&vhd) && value) /* if no space to write the primary footer, at least write the * backup 
footer so that it's possible to delete the VDI */ err = vhd_write_footer_at(&vhd, &vhd.footer, 0); } else { err = vhd_set_marker(&vhd, (char)value); } vhd_close(&vhd); return err; usage: printf("options: <-n name> <-f field> <-v value> [-h help]\n"); return -EINVAL; } blktap-2.0.90/vhd/lib/libvhdio.c0000644000000000000000000010353111664745551015071 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #ifdef _LARGEFILE_SOURCE #undef _LARGEFILE_SOURCE #endif #ifdef _LARGEFILE64_SOURCE #undef _LARGEFILE64_SOURCE #endif #if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64 #undef _FILE_OFFSET_BITS #define _FILE_OFFSET_BITS 32 #endif #ifdef _LARGEFILE_SOURCE #undef _LARGEFILE_SOURCE #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #define _FCNTL_H #include #include "libvhd.h" #include "partition.h" #define _ARRAY_SIZE(array) (sizeof(array) / sizeof(array[0])) #define __RESOLVE(func, name) \ do { \ if (!_libvhd_io_initialized) \ _libvhd_io_init(); \ if (!(func)) \ (func) = _get_std_fn((name)); \ } while (0) #define _RESOLVE(func) __RESOLVE((func), __func__) #define LIBVHD_IO_DEBUG "LIBVHD_IO_DEBUG" #define LIBVHD_IO_DUMP "LIBVHD_IO_DUMP" #define LIBVHD_IO_TEST "LIBVHD_IO_TEST" static int libvhdio_logging; static FILE *libvhdio_log; #define LOG(_f, _a...) 
\ do { \ if (libvhdio_logging && libvhdio_log) { \ fprintf(libvhdio_log, _f, ##_a); \ fflush(libvhdio_log); \ } \ } while (0) static int libvhdio_dump; #define DUMP(_buf, _size) \ do { \ if (libvhdio_log && libvhdio_dump) { \ int i; \ LOG("'"); \ for (i = 0; i < (_size); i++) \ fputc(((char *)(_buf))[i], \ libvhdio_log); \ LOG("'\n"); \ } \ } while (0)
/* One interposed symbol: its name and the address dlsym(RTLD_NEXT) resolved for it. */
struct _function { const char *name; void *fn; };
/* An open VHD shared by every descriptor that refers to the same inode; freed when refcnt drops to 0. */
struct vhd_object { vhd_context_t vhd; int refcnt; uint64_t ino; struct list_head next; };
/* A view onto a vhd_object: the whole disk (partition 0) or one primary partition, plus the open(2) flags used. */
struct vhd_partition { struct vhd_object *vhd_obj; int partition; int flags; off64_t start; /* in sectors */ off64_t end; /* in sectors */ off64_t size; /* in sectors */ };
/* Per-open state stored in _vhd_map: the partition view, the current file offset, and how many user fds share it. */
struct vhd_fd_context { struct vhd_partition vhd_part; off64_t off; int users; };
typedef struct vhd_object vhd_object_t; typedef struct vhd_partition vhd_partition_t; typedef struct vhd_fd_context vhd_fd_context_t; typedef int (*_std_open_t)(const char *, int, int); typedef int (*_std_close_t)(int); typedef FILE *(*_std_fopen_t)(const char *, const char *);
/*
 * Table of libc entry points this library wraps; every .fn is filled in by
 * _libvhd_io_init() and looked up by name through _get_std_fn().
 * NOTE(review): the guarded entries test for preprocessor macros named
 * __open_2, __open64_2 and dup3; if those are plain functions on the target
 * libc the entries are omitted -- confirm the wrappers can still resolve them.
 */
static struct _function _function_table[] = { { .name = "open", .fn = NULL }, { .name = "open64", .fn = NULL }, #ifdef __open_2 { .name = "__open_2", .fn = NULL }, #endif // __open_2 #ifdef __open64_2 { .name = "__open64_2", .fn = NULL }, #endif // __open64_2 { .name = "close", .fn = NULL }, { .name = "dup", .fn = NULL }, { .name = "dup2", .fn = NULL }, #ifdef dup3 { .name = "dup3", .fn = NULL }, #endif // dup3 { .name = "lseek", .fn = NULL }, { .name = "lseek64", .fn = NULL }, { .name = "read", .fn = NULL }, { .name = "write", .fn = NULL }, { .name = "pread", .fn = NULL }, { .name = "pread64", .fn = NULL }, { .name = "pwrite", .fn = NULL }, { .name = "pwrite64", .fn = NULL }, { .name = "fsync", .fn = NULL }, { .name = "__xstat", .fn = NULL }, { .name = "__xstat64", .fn = NULL }, { .name = "__fxstat", .fn = NULL }, { .name = "__fxstat64", .fn = NULL }, { .name = "__lxstat", .fn = NULL }, { .name = "__lxstat64", .fn = NULL }, { .name = "ioctl", .fn = NULL
}, { .name = "fcntl", .fn = NULL }, { .name = "fopen", .fn = NULL }, { .name = "fopen64", .fn = NULL }, { .name = "_IO_getc", .fn = NULL }, { .name = "fread", .fn = NULL }, { .name = "posix_memalign", .fn = NULL }, }; static int _libvhd_io_interpose = 1; static struct list_head _vhd_objects; static vhd_fd_context_t **_vhd_map; static int _vhd_map_size; static int _libvhd_io_initialized; static void _libvhd_io_init(void) __attribute__((constructor)); static volatile sig_atomic_t _libvhd_io_reset_vhds; static void * _load_std_fn(const char *name) { void *fn; char *msg; LOG("loading %s\n", name); fn = dlsym(RTLD_NEXT, name); msg = dlerror(); if (!fn || msg) { LOG("dlsym '%s' failed: %s\n", name, msg); exit(1); } return fn; } static void * _get_std_fn(const char *name) { int i; for (i = 0; i < _ARRAY_SIZE(_function_table); i++) if (!strcmp(name, _function_table[i].name)) return _function_table[i].fn; return NULL; } static void _init_vhd_log(void) { int (*_std_dup)(int) = _load_std_fn("dup"); int log_fd = _std_dup(STDERR_FILENO); libvhdio_log = fdopen(log_fd, "a"); if (getenv(LIBVHD_IO_DEBUG)) { libvhdio_logging = 1; libvhd_set_log_level(1); } if (getenv(LIBVHD_IO_DUMP)) libvhdio_dump = 1; } static void _init_vhd_map(void) { _vhd_map_size = sysconf(_SC_OPEN_MAX); _vhd_map = calloc(_vhd_map_size, sizeof(vhd_fd_context_t *)); if (!_vhd_map) { LOG("failed to init vhd map\n"); exit(1); } } static void _init_vhd_objs(void) { INIT_LIST_HEAD(&_vhd_objects); } static void _libvhd_io_reset(void) { int i, err; if (!_libvhd_io_interpose) return; _libvhd_io_reset_vhds = 0; if (!_vhd_map) return; _libvhd_io_interpose = 0; for (i = 0; i < _vhd_map_size; i++) { int flags; vhd_context_t *vhd; char *child, *parent; vhd_fd_context_t *vhd_fd = _vhd_map[i]; if (!vhd_fd) continue; vhd = &vhd_fd->vhd_part.vhd_obj->vhd; flags = vhd->oflags; child = strdup(vhd->file); if (!child) exit(ENOMEM); LOG("resetting vhd fd %d user fd %d\n", vhd->fd, i); vhd_close(vhd); if (asprintf(&parent, 
"%s.%d.vhd", child, (int)time(NULL)) == -1) exit(ENOMEM); if (rename(child, parent)) exit(errno); err = vhd_snapshot(child, 0, parent, 0, 0); if (err) { LOG("snapshot of %s failed on reset: %d\n", child, err); exit(1); } err = vhd_open(vhd, child, flags); if (err) { LOG("opening new snapshot %s failed on reset: %d\n", child, err); exit(1); } LOG("snapshot %s %s vhd fd %d user fd %d\n", child, parent, vhd->fd, i); free(child); free(parent); } _libvhd_io_interpose = 1; }
/* SIGCONT handler: only raises a flag; the reset itself runs later from _libvhd_io_map_get(). */
static void _libvhd_io_continue(int signo) { _libvhd_io_reset_vhds = 1; }
/* When LIBVHD_IO_TEST is set in the environment, install the SIGCONT handler that requests a VHD reset. */
static void _init_vhd_test(void) { if (getenv(LIBVHD_IO_TEST)) { sigset_t set; struct sigaction act; if (sigemptyset(&set)) exit(1); act = (struct sigaction) { .sa_handler = _libvhd_io_continue, .sa_mask = set, .sa_flags = 0, }; if (sigaction(SIGCONT, &act, NULL)) { LOG("failed to set signal handler: %d\n", errno); exit(1); } LOG("testing enabled\n"); } }
/* Library constructor, also called lazily from __RESOLVE: sets up logging, the fd map and the object list, then resolves every table entry. Runs once. */
static void _libvhd_io_init(void) { int i; if (_libvhd_io_initialized) return; _init_vhd_log(); _init_vhd_map(); _init_vhd_objs(); _init_vhd_test(); for (i = 0; i < _ARRAY_SIZE(_function_table); i++) _function_table[i].fn = _load_std_fn(_function_table[i].name); LOG("\n"); _libvhd_io_initialized = 1; }
/* Find the already-open vhd_object for @path (matched by inode number) or open a new one; returns it with refcnt incremented, or NULL with errno set. Interposition is disabled while it runs. */
static vhd_object_t * _libvhd_io_get_vhd(const char *path, int flags) { struct stat64 st; int err, vhd_flags; vhd_object_t *tmp, *obj = NULL; _libvhd_io_interpose = 0; if (stat64(path, &st)) goto out; list_for_each_entry(tmp, &_vhd_objects, next) if (tmp->ino == st.st_ino) { obj = tmp; if (flags & (O_RDWR | O_WRONLY) && obj->vhd.oflags & VHD_OPEN_RDONLY) { errno = EACCES; obj = NULL; } goto out; } vhd_flags = VHD_OPEN_CACHED; /* * we open RDWR whenever we can since vhd objects may be shared and * we don't have a clean way to switch RDONLY vhds to RDWR. we'll * only open RDONLY when (flags & O_RDONLY) and we lack permission * to open RDWR. 
*/ if (access(path, W_OK) == -1) { if (errno != EACCES) goto out; if (flags & (O_WRONLY | O_RDWR)) goto out; vhd_flags |= VHD_OPEN_RDONLY; } else { vhd_flags |= VHD_OPEN_RDWR; } obj = malloc(sizeof(*obj)); if (!obj) { errno = ENOMEM; goto out; } INIT_LIST_HEAD(&obj->next); obj->refcnt = 0; obj->ino = st.st_ino; err = vhd_open(&obj->vhd, path, vhd_flags); if (err) { free(obj); obj = NULL; errno = err; goto out; } list_add(&obj->next, &_vhd_objects); out: _libvhd_io_interpose = 1; if (obj) { obj->refcnt++; LOG("%s: %s 0x%"PRIx64" 0x%x\n", __func__, path, obj->ino, obj->refcnt); } return obj; } static void _libvhd_io_put_vhd(vhd_object_t *obj) { LOG("%s: 0x%"PRIx64" 0x%x\n", __func__, obj->ino, obj->refcnt - 1); if (--obj->refcnt == 0) { vhd_close(&obj->vhd); list_del(&obj->next); free(obj); } } static inline vhd_fd_context_t * _libvhd_io_map_get(int idx) { if (_libvhd_io_reset_vhds) _libvhd_io_reset(); return _vhd_map[idx]; } static inline void _libvhd_io_map_set(int idx, vhd_fd_context_t *vhd_fd) { vhd_fd->users++; _vhd_map[idx] = vhd_fd; LOG("mapping 0x%x to %s (0x%x users)\n", idx, vhd_fd->vhd_part.vhd_obj->vhd.file, vhd_fd->users); } static inline void _libvhd_io_map_clear(int idx) { vhd_fd_context_t *vhd_fd; if (idx < 0 || idx >= _vhd_map_size) return; vhd_fd = _vhd_map[idx]; _vhd_map[idx] = NULL; if (vhd_fd) { if (--vhd_fd->users == 0) { _libvhd_io_put_vhd(vhd_fd->vhd_part.vhd_obj); free(vhd_fd); } } } static int _libvhd_io_read_bytes(vhd_partition_t *vhd_part, void *buf, size_t size, uint64_t off) { int ret; vhd_context_t *vhd = &vhd_part->vhd_obj->vhd; _libvhd_io_interpose = 0; ret = vhd_io_read_bytes(vhd, buf, size, off); _libvhd_io_interpose = 1; if (ret) { LOG("vhd_io_read_bytes %s %p 0x%zx 0x%"PRIx64" failed: %d\n", vhd->file, buf, size, off, ret); errno = -ret; ret = 1; } else { LOG("vhd_io_read_bytes %s %p 0x%zx 0x%"PRIx64"\n", vhd->file, buf, size, off); DUMP(buf, size); } return ret; } static int _libvhd_io_write_bytes(vhd_partition_t *vhd_part, const 
void *buf, size_t size, uint64_t off) { int ret; vhd_context_t *vhd = &vhd_part->vhd_obj->vhd; _libvhd_io_interpose = 0; ret = vhd_io_write_bytes(vhd, (void *)buf, size, off); _libvhd_io_interpose = 1; if (ret) { LOG("vhd_io_write_bytes %s %p 0x%zx 0x%"PRIx64" failed: %d\n", vhd->file, buf, size, off, ret); errno = -ret; ret = 1; } else { LOG("vhd_io_write_bytes %s %p 0x%zx 0x%"PRIx64"\n", vhd->file, buf, size, off); DUMP(buf, size); } return ret; }
/*
 * symlink pathnames like *.vhd[1-4] are treated specially: the trailing
 * digit selects a primary partition of the VHD the link points at.
 *
 * On return *partition is 0 (whole disk) or 1-4, and *skip is set when the
 * path is not a symlink and is smaller than one sector, i.e. too small to
 * be a VHD.  Returns 0 on success, the lstat64() errno if the path cannot
 * be examined, or EINVAL when a symlink has an unrecognised suffix after
 * ".vhd".  Only the first ".vhd" found in the path is considered.
 */
static int _libvhd_io_guess_partition(const char *path, int *partition, int *skip) { char *sfx; int err, len; struct stat64 st; *skip = 0; *partition = 0; _libvhd_io_interpose = 0; err = lstat64(path, &st); _libvhd_io_interpose = 1; if (err == -1) return errno; if ((st.st_mode & __S_IFMT) != __S_IFLNK) { if (st.st_size < VHD_SECTOR_SIZE) *skip = 1; return 0; } sfx = strstr(path, ".vhd"); if (!sfx) return 0; sfx += strlen(".vhd"); len = strlen(sfx); if (!len) return 0; if (len > 1) return EINVAL; switch (*sfx) { case '1' ... 
'4': *partition = atoi(sfx); break; default: return EINVAL; } return 0; } static int _libvhd_io_init_partition(vhd_partition_t *vhd_part, int partition) { int err; vhd_context_t *vhd; void *_p; struct partition_table *pt; struct primary_partition *p; if (partition < 0 || partition > 4) return ENOENT; vhd = &vhd_part->vhd_obj->vhd; if (!partition) { vhd_part->partition = 0; vhd_part->start = 0; vhd_part->end = (vhd->footer.curr_size >> VHD_SECTOR_SHIFT); vhd_part->size = vhd_part->end; return 0; } err = posix_memalign(&_p, VHD_SECTOR_SIZE, VHD_SECTOR_SIZE); if (err) return err; pt = _p; err = _libvhd_io_read_bytes(vhd_part, pt, 512, 0); if (err) { LOG("reading partition failed: %d\n", err); goto out; } partition_table_in(pt); err = partition_table_validate(pt); if (err) { LOG("bad partition table read\n"); goto out; } p = pt->partitions + (partition - 1); if (!p->lba || !p->blocks) { err = ENOENT; goto out; } vhd_part->partition = partition; vhd_part->start = p->lba; vhd_part->end = p->lba + p->blocks; vhd_part->size = p->blocks; err = 0; LOG("%s: opening %s partition 0x%x start 0x%08"PRIx64" end 0x%08"PRIx64"\n", __func__, vhd->file, partition, vhd_part->start, vhd_part->end); out: free(pt); return err; } static int _libvhd_io_vhd_open(vhd_partition_t *vhd_part, const char *path, int flags) { int err, skip, partition; memset(vhd_part, 0, sizeof(*vhd_part)); vhd_part->flags = flags; err = _libvhd_io_guess_partition(path, &partition, &skip); if (err) return err; if (skip) return EINVAL; LOG("%s: attempting vhd_open of %s\n", __func__, path); vhd_part->vhd_obj = _libvhd_io_get_vhd(path, flags); if (!vhd_part->vhd_obj) err = errno; if (!err) { err = _libvhd_io_init_partition(vhd_part, partition); if (err) { _libvhd_io_put_vhd(vhd_part->vhd_obj); memset(vhd_part, 0, sizeof(*vhd_part)); } } return (err >= 0 ? 
err : -err); }
/*
 * Common body of the open() wrappers.  Tries to open @pathname as a VHD; if
 * that fails with EINVAL or ENOENT the call is passed to the real open via
 * @_std_open with the caller's flags and mode.  For a VHD, a placeholder
 * descriptor on /dev/null is opened and bound to the new context in the fd
 * map, and that descriptor is returned.  Flags listed in the check below are
 * rejected with EINVAL for VHDs.  Returns -1 with errno set on failure.
 */
static int _libvhd_io_open(const char *pathname, int flags, mode_t mode, _std_open_t _std_open) { int err, fd; vhd_fd_context_t *vhd_fd; errno = 0; vhd_fd = NULL; vhd_fd = calloc(1, sizeof(*vhd_fd)); if (!vhd_fd) { err = ENOMEM; goto fail; } err = _libvhd_io_vhd_open(&vhd_fd->vhd_part, pathname, flags); if (err) { if (err == EINVAL || err == ENOENT) goto std_open; LOG("%s: vhd_open of %s failed: %d\n", __func__, pathname, err); goto fail; } #ifdef O_CLOEXEC if (flags & (O_APPEND | O_ASYNC | O_CLOEXEC | O_DIRECTORY | O_NONBLOCK)) { #else if (flags & (O_APPEND | O_ASYNC | O_DIRECTORY | O_NONBLOCK)) { #endif //O_CLOEXEC LOG("%s: invalid flags for vhd_open: 0x%x\n", __func__, flags); err = EINVAL; goto fail; } fd = _std_open("/dev/null", O_RDONLY, 0); if (fd == -1) { err = errno; goto fail; } _libvhd_io_map_set(fd, vhd_fd); return fd; std_open: free(vhd_fd); return _std_open(pathname, flags, mode); fail: if (vhd_fd && vhd_fd->vhd_part.vhd_obj) _libvhd_io_put_vhd(vhd_fd->vhd_part.vhd_obj); free(vhd_fd); errno = err; return -1; } static int _libvhd_io_close(int fd, _std_close_t _std_close) { _libvhd_io_map_clear(fd); return _std_close(fd); } static FILE * _libvhd_io_fopen(const char *path, const char *mode) { char *m; FILE *f; int fd, flags; vhd_fd_context_t *vhd_fd; static _std_open_t _std_open64; __RESOLVE(_std_open64, "open64"); flags = 0; if (strchr(mode, 'a')) { if (strchr(mode, '+')) flags |= O_APPEND | O_RDWR; else flags |= O_APPEND | O_WRONLY; } if (strchr(mode, 'r')) { if (strchr(mode, '+')) flags |= O_RDWR; else flags |= O_RDONLY; } if (strchr(mode, 'w')) { errno = EINVAL; return NULL; } fd = _libvhd_io_open(path, flags, 0, _std_open64); if (fd == -1) return NULL; vhd_fd = _libvhd_io_map_get(fd); if (vhd_fd) m = "r"; else m = (char *)mode; f = fdopen(fd, m); if (!f) { int err = errno; close(fd); errno = err; } return f; } static ssize_t _libvhd_io_pread(vhd_partition_t *vhd_part, void *buf, size_t count, off64_t offset) { ssize_t ret; off64_t 
psize; ret = (ssize_t)-1; psize = vhd_part->size << VHD_SECTOR_SHIFT; if (vhd_part->flags & O_WRONLY) { errno = EPERM; goto out; } if (offset >= psize) { ret = 0; goto out; } count = MIN(count, psize - offset); offset += (vhd_part->start << VHD_SECTOR_SHIFT); if (_libvhd_io_read_bytes(vhd_part, buf, count, offset)) goto out; ret = count; out: return ret; } static ssize_t _libvhd_io_pwrite(vhd_partition_t *vhd_part, const void *buf, size_t count, off64_t offset) { ssize_t ret; off64_t psize; ret = (ssize_t)-1; psize = vhd_part->size << VHD_SECTOR_SHIFT; if (vhd_part->flags & O_RDONLY) { errno = EPERM; goto out; } if (offset >= psize) { ret = 0; goto out; } count = MIN(count, psize - offset); offset += (vhd_part->start << VHD_SECTOR_SHIFT); if (_libvhd_io_write_bytes(vhd_part, buf, count, offset)) goto out; ret = count; out: return ret; } static int _libvhd_io_fstat(int version, vhd_partition_t *vhd_part, struct stat *stats) { int ret; static int (*_std___fxstat)(int, int, struct stat *); __RESOLVE(_std___fxstat, "__fxstat"); ret = _std___fxstat(version, vhd_part->vhd_obj->vhd.fd, stats); if (ret) return ret; /* * emulate block device */ stats->st_size = 0; stats->st_blocks = 0; stats->st_blksize = getpagesize(); stats->st_mode &= ~__S_IFREG; stats->st_mode |= __S_IFBLK; return 0; } static int _libvhd_io_fstat64(int version, vhd_partition_t *vhd_part, struct stat64 *stats) { int ret; static int (*_std___fxstat64)(int, int, struct stat64 *); __RESOLVE(_std___fxstat64, "__fxstat64"); ret = _std___fxstat64(version, vhd_part->vhd_obj->vhd.fd, stats); if (ret) return ret; /* * emulate block device */ stats->st_size = 0; stats->st_blocks = 0; stats->st_blksize = getpagesize(); stats->st_mode &= ~__S_IFREG; stats->st_mode |= __S_IFBLK; return 0; } static int _libvhd_io_stat(int version, const char *path, struct stat *stats) { int err; vhd_partition_t vhd_part; err = _libvhd_io_vhd_open(&vhd_part, path, O_RDONLY); if (err) { errno = (err > 0 ? 
err : -err); return -1; } err = _libvhd_io_fstat(version, &vhd_part, stats); _libvhd_io_put_vhd(vhd_part.vhd_obj); return err; }
/* 64-bit variant: stat @path as a VHD by opening it temporarily; returns -1 with errno set when the path cannot be opened as a VHD. */
static int _libvhd_io_stat64(int version, const char *path, struct stat64 *stats) { int err; vhd_partition_t vhd_part; err = _libvhd_io_vhd_open(&vhd_part, path, O_RDONLY); if (err) { errno = (err > 0 ? err : -err); return -1; } err = _libvhd_io_fstat64(version, &vhd_part, stats); _libvhd_io_put_vhd(vhd_part.vhd_obj); return err; }
/*
 * Interposed open(): the mode argument is only honoured when O_CREAT is set.
 * While interposition is disabled (the library's own I/O) the call goes
 * straight to the real open; otherwise _libvhd_io_open() decides whether the
 * path is a VHD.
 */
int open(const char *pathname, int flags, mode_t _mode) { int fd; mode_t mode; static _std_open_t _std_open; _RESOLVE(_std_open); mode = (flags & O_CREAT ? _mode : 0); if (!_libvhd_io_interpose) return _std_open(pathname, flags, mode); fd = _libvhd_io_open(pathname, flags, mode, _std_open); LOG("%s %s 0x%x 0x%x: 0x%x\n", __func__, pathname, flags, mode, fd); return fd; }
/* Interposed open64(): same logic as open(), forwarding to the real open64. */
int open64(const char *pathname, int flags, mode_t _mode) { int fd; mode_t mode; static _std_open_t _std_open64; _RESOLVE(_std_open64); mode = (flags & O_CREAT ? _mode : 0); if (!_libvhd_io_interpose) return _std_open64(pathname, flags, mode); fd = _libvhd_io_open(pathname, flags, mode, _std_open64); LOG("%s %s 0x%x 0x%x: 0x%x\n", __func__, pathname, flags, mode, fd); return fd; }
/* Interposed __open_2(): same logic as open(), forwarding to the real __open_2. */
int __open_2(const char *pathname, int flags, mode_t _mode) { int fd; mode_t mode; static _std_open_t _std___open_2; _RESOLVE(_std___open_2); mode = (flags & O_CREAT ? _mode : 0); if (!_libvhd_io_interpose) return _std___open_2(pathname, flags, mode); fd = _libvhd_io_open(pathname, flags, mode, _std___open_2); LOG("%s %s 0x%x 0x%x: 0x%x\n", __func__, pathname, flags, mode, fd); return fd; }
/* Interposed __open64_2(): same logic as open(), forwarding to the real __open64_2. */
int __open64_2(const char *pathname, int flags, mode_t _mode) { int fd; mode_t mode; static _std_open_t _std___open64_2; _RESOLVE(_std___open64_2); mode = (flags & O_CREAT ? 
_mode : 0); if (!_libvhd_io_interpose) return _std___open64_2(pathname, flags, mode); fd = _libvhd_io_open(pathname, flags, mode, _std___open64_2); LOG("%s %s 0x%x 0x%x: 0x%x\n", __func__, pathname, flags, mode, fd); return fd; } int close(int fd) { static _std_close_t _std_close; _RESOLVE(_std_close); LOG("%s 0x%x\n", __func__, fd); return _libvhd_io_close(fd, _std_close); } int dup(int oldfd) { int newfd; vhd_fd_context_t *vhd_fd; static int (*_std_dup)(int); _RESOLVE(_std_dup); vhd_fd = _libvhd_io_map_get(oldfd); LOG("%s 0x%x\n", __func__, oldfd); newfd = _std_dup(oldfd); if (newfd != -1 && vhd_fd) _libvhd_io_map_set(newfd, vhd_fd); return newfd; } int dup2(int oldfd, int newfd) { int ret; vhd_fd_context_t *vhd_fd; static int (*_std_dup2)(int, int); _RESOLVE(_std_dup2); vhd_fd = _libvhd_io_map_get(oldfd); LOG("%s 0x%x 0x%x\n", __func__, oldfd, newfd); ret = _std_dup2(oldfd, newfd); if (ret != -1 && vhd_fd) _libvhd_io_map_set(ret, vhd_fd); return ret; } int dup3(int oldfd, int newfd, int flags) { int ret; vhd_fd_context_t *vhd_fd; static int (*_std_dup3)(int, int, int); _RESOLVE(_std_dup3); vhd_fd = _libvhd_io_map_get(oldfd); LOG("%s 0x%x 0x%x 0x%x\n", __func__, oldfd, newfd, flags); /* * TODO: handle O_CLOEXEC... 
*/ ret = _std_dup3(oldfd, newfd, flags); if (ret != -1 && vhd_fd) _libvhd_io_map_set(ret, vhd_fd); return ret; } off_t lseek(int fd, off_t offset, int whence) { off_t new_off; vhd_fd_context_t *vhd_fd; static off_t (*_std_lseek)(int, off_t, int); _RESOLVE(_std_lseek); vhd_fd = _libvhd_io_map_get(fd); LOG("%s 0x%x 0x%lx 0x%x\n", __func__, fd, offset, whence); if (!vhd_fd) return _std_lseek(fd, offset, whence); switch (whence) { case SEEK_SET: new_off = offset; break; case SEEK_CUR: new_off = vhd_fd->off + offset; break; case SEEK_END: new_off = (vhd_fd->vhd_part.size << VHD_SECTOR_SHIFT) + offset; break; default: errno = EINVAL; return (off_t)-1; } if (new_off < 0 || new_off > vhd_fd->vhd_part.size << VHD_SECTOR_SHIFT) { errno = EINVAL; return (off_t)-1; } vhd_fd->off = new_off; return vhd_fd->off; } off64_t lseek64(int fd, off64_t offset, int whence) { off64_t new_off; vhd_fd_context_t *vhd_fd; static off64_t (*_std_lseek64)(int, off64_t, int); _RESOLVE(_std_lseek64); vhd_fd = _libvhd_io_map_get(fd); LOG("%s 0x%x 0x%"PRIx64" 0x%x\n", __func__, fd, offset, whence); if (!vhd_fd) return _std_lseek64(fd, offset, whence); switch (whence) { case SEEK_SET: new_off = offset; break; case SEEK_CUR: new_off = vhd_fd->off + offset; break; case SEEK_END: new_off = (vhd_fd->vhd_part.size << VHD_SECTOR_SHIFT) + offset; break; default: errno = EINVAL; return (off64_t)-1; } if (new_off < 0 || new_off > vhd_fd->vhd_part.size << VHD_SECTOR_SHIFT) { errno = EINVAL; return (off64_t)-1; } vhd_fd->off = new_off; return vhd_fd->off; } ssize_t read(int fd, void *buf, size_t count) { ssize_t ret; vhd_fd_context_t *vhd_fd; static ssize_t (*_std_read)(int, void *, size_t); _RESOLVE(_std_read); vhd_fd = _libvhd_io_map_get(fd); LOG("%s 0x%x %p 0x%zx\n", __func__, fd, buf, count); if (!vhd_fd) return _std_read(fd, buf, count); ret = _libvhd_io_pread(&vhd_fd->vhd_part, buf, count, vhd_fd->off); if (ret != -1) vhd_fd->off += count; return ret; } ssize_t write(int fd, const void *buf, size_t count) 
{
	/*
	 * Interposed write(): for VHD descriptors, writes at the context's
	 * current offset and advances it by the number of bytes actually
	 * written (the helper shortens writes that cross the end of the
	 * partition).  Other descriptors go to the real write().
	 */
	ssize_t ret;
	vhd_fd_context_t *vhd_fd;
	static ssize_t (*_std_write)(int, const void *, size_t);

	_RESOLVE(_std_write);

	vhd_fd = _libvhd_io_map_get(fd);
	LOG("%s 0x%x %p 0x%zx\n", __func__, fd, buf, count);

	if (!vhd_fd)
		return _std_write(fd, buf, count);

	ret = _libvhd_io_pwrite(&vhd_fd->vhd_part, buf, count, vhd_fd->off);
	if (ret != -1)
		vhd_fd->off += ret;

	return ret;
}

ssize_t pread(int fd, void *buf, size_t count, off_t offset) { vhd_fd_context_t *vhd_fd; static ssize_t (*_std_pread)(int, void *, size_t, off_t); _RESOLVE(_std_pread); vhd_fd = _libvhd_io_map_get(fd); LOG("%s 0x%x %p 0x%zx 0x%lx\n", __func__, fd, buf, count, offset); if (!vhd_fd) return _std_pread(fd, buf, count, offset); return _libvhd_io_pread(&vhd_fd->vhd_part, buf, count, offset); } ssize_t pread64(int fd, void *buf, size_t count, off64_t offset) { vhd_fd_context_t *vhd_fd; static ssize_t (*_std_pread64)(int, void *, size_t, off64_t); _RESOLVE(_std_pread64); vhd_fd = _libvhd_io_map_get(fd); LOG("%s 0x%x %p 0x%zx 0x%"PRIx64"\n", __func__, fd, buf, count, offset); if (!vhd_fd) return _std_pread64(fd, buf, count, offset); return _libvhd_io_pread(&vhd_fd->vhd_part, buf, count, offset); } ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset) { vhd_fd_context_t *vhd_fd; static ssize_t (*_std_pwrite)(int, const void *, size_t, off_t); _RESOLVE(_std_pwrite); vhd_fd = _libvhd_io_map_get(fd); LOG("%s 0x%x %p 0x%zx, 0x%lx\n", __func__, fd, buf, count, offset); if (!vhd_fd) return _std_pwrite(fd, buf, count, offset); return _libvhd_io_pwrite(&vhd_fd->vhd_part, buf, count, offset); } ssize_t pwrite64(int fd, const void *buf, size_t count, off64_t offset) { vhd_fd_context_t *vhd_fd; static ssize_t (*_std_pwrite64)(int, const void *, size_t, off64_t); _RESOLVE(_std_pwrite64); vhd_fd = _libvhd_io_map_get(fd); LOG("%s 0x%x %p 0x%zx, 0x%"PRIx64"\n", __func__, fd, buf, count, offset); if (!vhd_fd) return _std_pwrite64(fd, buf, count, offset); return _libvhd_io_pwrite(&vhd_fd->vhd_part, buf, count, 
offset); }
/* Interposed fsync(): for a VHD descriptor, syncs the underlying VHD file's descriptor instead of the placeholder fd. */
int fsync(int fd) { vhd_fd_context_t *vhd_fd; static int (*_std_fsync)(int); _RESOLVE(_std_fsync); vhd_fd = _libvhd_io_map_get(fd); if (!vhd_fd) return _std_fsync(fd); LOG("%s 0x%x\n", __func__, fd); return _std_fsync(vhd_fd->vhd_part.vhd_obj->vhd.fd); }
/* Interposed __xstat(): tries to stat @path as a VHD first and falls back to the real __xstat when that fails. */
int __xstat(int version, const char *path, struct stat *buf) { int ret; static int (*_std___xstat)(int, const char *, struct stat *); _RESOLVE(_std___xstat); if (!_libvhd_io_interpose) return _std___xstat(version, path, buf); LOG("%s 0x%x %s %p\n", __func__, version, path, buf); ret = _libvhd_io_stat(version, path, buf); if (ret) ret = _std___xstat(version, path, buf); return ret; }
/* Interposed __xstat64(): 64-bit counterpart of __xstat() above, with the same fallback. */
int __xstat64(int version, const char *path, struct stat64 *buf) { int ret; static int (*_std___xstat64)(int, const char *, struct stat64 *); _RESOLVE(_std___xstat64); if (!_libvhd_io_interpose) return _std___xstat64(version, path, buf); LOG("%s 0x%x %s %p\n", __func__, version, path, buf); ret = _libvhd_io_stat64(version, path, buf); if (ret) ret = _std___xstat64(version, path, buf); return ret; }
/* Interposed __fxstat(): VHD descriptors get the emulated block-device stat; all others use the real __fxstat. */
int __fxstat(int version, int fd, struct stat *buf) { vhd_fd_context_t *vhd_fd; static int (*_std___fxstat)(int, int, struct stat *); _RESOLVE(_std___fxstat); vhd_fd = _libvhd_io_map_get(fd); LOG("%s 0x%x 0x%x %p\n", __func__, version, fd, buf); if (vhd_fd) return _libvhd_io_fstat(version, &vhd_fd->vhd_part, buf); else return _std___fxstat(version, fd, buf); }
/* Interposed __fxstat64(): 64-bit counterpart of __fxstat() above. */
int __fxstat64(int version, int fd, struct stat64 *buf) { vhd_fd_context_t *vhd_fd; static int (*_std___fxstat64)(int, int, struct stat64 *); _RESOLVE(_std___fxstat64); vhd_fd = _libvhd_io_map_get(fd); LOG("%s 0x%x 0x%x %p\n", __func__, version, fd, buf); if (vhd_fd) return _libvhd_io_fstat64(version, &vhd_fd->vhd_part, buf); else return _std___fxstat64(version, fd, buf); } /* * NB: symlinks to vhds will be stat'ed rather than lstat'ed. 
*/ int __lxstat(int version, const char *path, struct stat *buf) { int ret; static int (*_std___lxstat)(int, const char *, struct stat *); _RESOLVE(_std___lxstat); if (!_libvhd_io_interpose) return _std___lxstat(version, path, buf); LOG("%s 0x%x %s %p\n", __func__, version, path, buf); ret = _libvhd_io_stat(version, path, buf); if (ret) ret = _std___lxstat(version, path, buf); return ret; } /* * NB: symlinks to vhds will be stat'ed rather than lstat'ed. */ int __lxstat64(int version, const char *path, struct stat64 *buf) { int ret; static int (*_std___lxstat64)(int, const char *, struct stat64 *); _RESOLVE(_std___lxstat64); if (!_libvhd_io_interpose) return _std___lxstat64(version, path, buf); LOG("%s 0x%x %s %p\n", __func__, version, path, buf); ret = _libvhd_io_stat64(version, path, buf); if (ret) ret = _std___lxstat64(version, path, buf); return ret; }

/*
 * Interposed ioctl(): answers the block-device size and geometry queries
 * for VHD descriptors from the VHD footer and partition bounds.  Requests
 * that are not handled here, and all requests on ordinary descriptors, are
 * passed to the real ioctl().
 */
int
ioctl(int fd, int request, char *argp)
{
	vhd_fd_context_t *vhd_fd;
	static int (*_std_ioctl)(int, int, char *);

	_RESOLVE(_std_ioctl);

	vhd_fd = _libvhd_io_map_get(fd);
	if (!vhd_fd)
		return _std_ioctl(fd, request, argp);

	LOG("%s 0x%x 0x%x %p\n", __func__, fd, request, argp);

#ifdef BLKGETSIZE64
	if (request == BLKGETSIZE64) {
		/* size in bytes */
		uint64_t *size = (uint64_t *)argp;
		*size = vhd_fd->vhd_part.size << VHD_SECTOR_SHIFT;
		return 0;
	}
#endif
#ifdef BLKGETSIZE
	if (request == BLKGETSIZE) {
		/* BLKGETSIZE reports 512-byte units, not bytes */
		unsigned long *size = (unsigned long *)argp;
		*size = (vhd_fd->vhd_part.size << VHD_SECTOR_SHIFT) >> 9;
		return 0;
	}
#endif
#ifdef BLKSSZGET
	if (request == BLKSSZGET) {
		int *sec_size = (int *)argp;
		*sec_size = VHD_SECTOR_SIZE;
		return 0;
	}
#endif
#ifdef HDIO_GETGEO
	if (request == HDIO_GETGEO) {
		vhd_context_t *vhd = &vhd_fd->vhd_part.vhd_obj->vhd;
		struct hd_geometry *geo = (struct hd_geometry *)argp;
		geo->heads     = GEOM_GET_HEADS(vhd->footer.geometry);
		geo->sectors   = GEOM_GET_SPT(vhd->footer.geometry);
		geo->cylinders = GEOM_GET_CYLS(vhd->footer.geometry);
		geo->start     = vhd_fd->vhd_part.start;
		return 0;
	}
#endif

	return _std_ioctl(fd, request, argp);
}

int
fcntl(int fd, int cmd, ...) { int real_fd; va_list args; vhd_fd_context_t *vhd_fd; static int (*_std_fcntl)(int, int, ...); _RESOLVE(_std_fcntl); real_fd = fd; vhd_fd = _libvhd_io_map_get(fd); if (vhd_fd) real_fd = vhd_fd->vhd_part.vhd_obj->vhd.fd; LOG("%s 0x%x 0x%x\n", __func__, fd, cmd); switch (cmd) { case F_GETFD: case F_GETFL: case F_GETOWN: case F_GETSIG: case F_GETLEASE: LOG("%s 0x%x void\n", __func__, real_fd); return _std_fcntl(real_fd, cmd); case F_DUPFD: #ifdef F_DUPFD_CLOEXEC case F_DUPFD_CLOEXEC: #endif // F_DUPFD_CLOEXEC case F_SETFD: case F_SETFL: case F_SETOWN: case F_SETSIG: case F_SETLEASE: case F_NOTIFY: { long arg; va_start(args, cmd); arg = va_arg(args, long); va_end(args); LOG("%s 0x%x long 0x%lx\n", __func__, real_fd, arg); return _std_fcntl(real_fd, cmd, arg); } case F_SETLK: case F_SETLKW: case F_GETLK: { struct flock *flk; va_start(args, cmd); flk = va_arg(args, struct flock *); va_end(args); LOG("%s 0x%x lock %p\n", __func__, real_fd, flk); return _std_fcntl(real_fd, cmd, flk); } #if __WORDSIZE == 32 case F_SETLK64: case F_SETLKW64: case F_GETLK64: { struct flock64 *flk; va_start(args, cmd); flk = va_arg(args, struct flock64 *); va_end(args); LOG("%s 0x%x lock64 %p (%p)\n", __func__, real_fd, flk, _std_fcntl); return _std_fcntl(real_fd, cmd, flk); } #endif default: LOG("%s unrecognized cmd\n", __func__); errno = EINVAL; return -1; } } FILE * fopen(const char *path, const char *mode) { FILE *f; static _std_fopen_t _std_fopen; _RESOLVE(_std_fopen); if (!_libvhd_io_interpose || strchr(mode, 'w')) return _std_fopen(path, mode); f = _libvhd_io_fopen(path, mode); LOG("%s %s %s: 0x%x\n", __func__, path, mode, (f ? fileno(f) : -1)); return f; } FILE * fopen64(const char *path, const char *mode) { FILE *f; static _std_fopen_t _std_fopen64; _RESOLVE(_std_fopen64); if (!_libvhd_io_interpose || strchr(mode, 'w')) return _std_fopen64(path, mode); f = _libvhd_io_fopen(path, mode); LOG("%s %s %s: 0x%x\n", __func__, path, mode, (f ? 
fileno(f) : -1)); return f; } int _IO_getc(FILE *f) { int cnt; unsigned char c; vhd_fd_context_t *vhd_fd; static int (*_std__IO_getc)(FILE *); _RESOLVE(_std__IO_getc); vhd_fd = _libvhd_io_map_get(fileno(f)); if (!vhd_fd) return _std__IO_getc(f); LOG("%s %p (0x%x)\n", __func__, f, fileno(f)); cnt = _libvhd_io_pread(&vhd_fd->vhd_part, &c, sizeof(c), vhd_fd->off); if (cnt > 0) vhd_fd->off += cnt; return (int)c; } #ifdef _IO_getc_unlocked #undef _IO_getc_unlocked #endif int _IO_getc_unlocked(FILE *f) { return _IO_getc(f); } size_t fread(void *buf, size_t size, size_t n, FILE *f) { ssize_t cnt; vhd_fd_context_t *vhd_fd; static size_t (*_std_fread)(void *, size_t, size_t, FILE *); _RESOLVE(_std_fread); vhd_fd = _libvhd_io_map_get(fileno(f)); if (!vhd_fd) return _std_fread(buf, size, n, f); LOG("%s %p 0x%zx 0x%zx %p (0x%x)\n", __func__, buf, size, n, f, fileno(f)); cnt = _libvhd_io_pread(&vhd_fd->vhd_part, buf, n * size, vhd_fd->off); if (cnt > 0) { vhd_fd->off += cnt; cnt /= size; } return cnt; } #ifdef fread_unlocked #undef fread_unlocked #endif size_t fread_unlocked(void *buf, size_t size, size_t n, FILE *f) { return fread(buf, size, n, f); } /* * sigh... preloading with bash causes problems, since bash has its own * malloc(), memalign(), and free() functions, but no posix_memalign(). * this causes problems when libvhd free()'s posix_memalign()'ed memory. */ #define _libvhd_power_of_2(x) ((((x) - 1) & (x)) == 0) int posix_memalign(void **memptr, size_t alignment, size_t size) { if (!alignment || alignment % sizeof(void *) || !_libvhd_power_of_2(alignment / sizeof(void *))) return EINVAL; *memptr = memalign(alignment, size); if (!*memptr) return ENOMEM; return 0; } blktap-2.0.90/vhd/lib/Makefile.am0000644000000000000000000000266511664745551015167 0ustar rootroot SUBDIRS = . 
$(MAYBE_test) AM_CFLAGS = -Wall AM_CFLAGS += -Werror AM_CPPFLAGS = -D_GNU_SOURCE AM_CPPFLAGS += -I$(top_srcdir)/include AM_CPPFLAGS += -I$(top_srcdir)/lvm AM_CPPFLAGS += -I$(top_srcdir)/part lib_LTLIBRARIES = libvhd.la lib_LTLIBRARIES += $(MAYBE_libvhdio_la) libvhd_la_SOURCES = libvhd.c libvhd_la_SOURCES += libvhd-journal.c libvhd_la_SOURCES += libvhd-index.c libvhd_la_SOURCES += vhd-util-coalesce.c libvhd_la_SOURCES += vhd-util-create.c libvhd_la_SOURCES += vhd-util-fill.c libvhd_la_SOURCES += vhd-util-modify.c libvhd_la_SOURCES += vhd-util-query.c libvhd_la_SOURCES += vhd-util-read.c libvhd_la_SOURCES += vhd-util-repair.c libvhd_la_SOURCES += vhd-util-resize.c libvhd_la_SOURCES += vhd-util-revert.c libvhd_la_SOURCES += vhd-util-set-field.c libvhd_la_SOURCES += vhd-util-snapshot.c libvhd_la_SOURCES += vhd-util-scan.c libvhd_la_SOURCES += vhd-util-check.c libvhd_la_SOURCES += relative-path.c libvhd_la_SOURCES += relative-path.h libvhd_la_SOURCES += atomicio.c libvhd_la_SOURCES += atomicio.h libvhd_la_SOURCES += ../../lvm/lvm-util.c libvhd_la_LDFLAGS = -version-info 1:1:1 libvhd_la_LIBADD = -luuid $(LIBICONV) libvhdio_la_SOURCES = libvhdio.c libvhdio_la_SOURCES += ../../part/partition.c libvhdio_la_LDFLAGS = -release $(VERSION) libvhdio_la_LDFLAGS += -shared libvhdio_la_LIBADD = libvhd.la libvhdio_la_LIBADD += -ldl if ENABLE_VHDIO MAYBE_libvhdio_la = libvhdio.la endif if ENABLE_TESTS MAYBE_test = test endif blktap-2.0.90/vhd/lib/vhd-util-check.c0000644000000000000000000006522211664745551016104 0ustar rootroot/* * Copyright (c) 2007, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

/*
 * NOTE(review): the header names of the following #include directives are
 * missing from this copy of the file; restore them from the upstream source.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include "list.h"
#include "libvhd.h"
#include "vhd-util.h"

// allow the VHD timestamp to be at most this many seconds into the future to
// account for time skew with NFS servers
#define TIMESTAMP_MAX_SLACK 1800

/* Option flags set from the command line in vhd_util_check(). */
struct vhd_util_check_options {
	char ignore_footer;
	char ignore_parent_uuid;
	char ignore_timestamps;
	char check_data;
	char collect_stats;
};

/* Per-image statistics; one node per checked VHD, linked through 'next'. */
struct vhd_util_check_stats {
	char *name;			/* strdup() of the VHD file name */
	char *bitmap;			/* one bit per sector, secs_total bits */
	uint64_t secs_total;
	uint64_t secs_allocated;
	uint64_t secs_written;
	struct list_head next;
};

/* State shared by all checks of a single vhd_util_check() run. */
struct vhd_util_check_ctx {
	struct vhd_util_check_options opts;
	struct list_head stats;
	int primary_footer_missing;
};

/* Stats node at the front of the list, i.e. the most recently added one. */
#define ctx_cur_stats(ctx) \
	list_entry((ctx)->stats.next, struct vhd_util_check_stats, next)

/* Test bit nr of addr; bit 0 is the most significant bit of byte 0. */
static inline int
test_bit_u64(volatile char *addr, uint64_t nr)
{
	return ((addr[nr >> 3] << (nr & 7)) & 0x80) != 0;
}

/* Set bit nr of addr, using the same MSB-first numbering. */
static inline void
set_bit_u64(volatile char *addr, uint64_t nr)
{
	addr[nr >> 3] |= (0x80 >> (nr & 7));
}

/* Reset ctx->stats to an empty list. */
static void
vhd_util_check_stats_init(struct vhd_util_check_ctx *ctx)
{
	memset(&ctx->stats, 0, sizeof(ctx->stats));
	INIT_LIST_HEAD(&ctx->stats);
}

/* Release one stats node and the buffers it owns; NULL is accepted. */
static void
vhd_util_check_stats_free_one(struct vhd_util_check_stats *stats)
{
	if (stats) {
		free(stats->name);
		free(stats->bitmap);
		free(stats);
	}
}

/*
 * Allocate a stats node for vhd, sized for spb * max_bat_size sectors, and
 * push it onto the front of ctx->stats.
 * Returns 0 on success or -ENOMEM if any allocation fails.
 */
static int
vhd_util_check_stats_alloc_one(struct vhd_util_check_ctx *ctx,
			       vhd_context_t *vhd)
{
	int size;
	struct vhd_util_check_stats *stats;

	stats = calloc(1, sizeof(*stats));
	if (!stats)
		goto fail;

	stats->name = strdup(vhd->file);
	if (!stats->name)
		goto fail;

	stats->secs_total = (uint64_t)vhd->spb * vhd->header.max_bat_size;
	/* bytes needed to hold one bit per sector, rounded up */
	size = (stats->secs_total + 7) >> 3;
	stats->bitmap = calloc(1, size);
	if (!stats->bitmap)
		goto fail;

	INIT_LIST_HEAD(&stats->next);
	list_add(&stats->next, &ctx->stats);

	return 0;

fail:
	vhd_util_check_stats_free_one(stats);
	printf("failed to allocate stats for %s\n", vhd->file);
	return -ENOMEM;
}
static void vhd_util_check_stats_free(struct vhd_util_check_ctx *ctx) { struct vhd_util_check_stats *stats, *tmp; list_for_each_entry_safe(stats, tmp, &ctx->stats, next) { list_del_init(&stats->next); vhd_util_check_stats_free_one(stats); } } static inline float pct(uint64_t num, uint64_t den) { return (!den ? 0.0 : (((float)num / (float)den)) * 100.0); } static inline char * name(const char *path) { char *p = strrchr(path, '/'); if (p && (p - path) == strlen(path)) p = strrchr(--p, '/'); return (char *)(p ? ++p : path); } static void vhd_util_check_stats_print(struct vhd_util_check_ctx *ctx) { char *bitmap; uint64_t secs; struct vhd_util_check_stats *head, *cur, *prev; if (list_empty(&ctx->stats)) return; head = list_entry(ctx->stats.next, struct vhd_util_check_stats, next); printf("%s: secs allocated: 0x%"PRIx64" secs written: 0x%"PRIx64" (%.2f%%)\n", name(head->name), head->secs_allocated, head->secs_written, pct(head->secs_written, head->secs_allocated)); if (list_is_last(&head->next, &ctx->stats)) return; secs = head->secs_total; bitmap = malloc((secs + 7) >> 3); if (!bitmap) { printf("failed to allocate bitmap\n"); return; } memcpy(bitmap, head->bitmap, ((secs + 7) >> 3)); cur = prev = head; while (!list_is_last(&cur->next, &ctx->stats)) { uint64_t i, up = 0, uc = 0; cur = list_entry(cur->next.next, struct vhd_util_check_stats, next); for (i = 0; i < secs; i++) { if (test_bit_u64(cur->bitmap, i)) { if (!test_bit_u64(prev->bitmap, i)) up++; /* sector is unique wrt parent */ if (!test_bit_u64(bitmap, i)) uc++; /* sector is unique wrt chain */ set_bit_u64(bitmap, i); } } printf("%s: secs allocated: 0x%"PRIx64" secs written: 0x%"PRIx64 " (%.2f%%) secs not in parent: 0x%"PRIx64" (%.2f%%)" " secs not in ancestors: 0x%"PRIx64" (%.2f%%)\n", name(cur->name), cur->secs_allocated, cur->secs_written, pct(cur->secs_written, cur->secs_allocated), up, pct(up, cur->secs_written), uc, pct(uc, cur->secs_written)); prev = cur; } free(bitmap); } static int 
vhd_util_check_zeros(void *buf, size_t size)
{
	/*
	 * Returns non-zero if any of the size bytes at buf is non-zero, and 0
	 * if they are all zero. Every caller only tests the result for truth.
	 * The original returned the index of the first non-zero byte, which
	 * is 0 -- "all zero" -- when the very first byte is non-zero.
	 */
	size_t i;
	const char *p = buf;

	for (i = 0; i < size; i++)
		if (p[i])
			return 1;

	return 0;
}

/*
 * Validate an in-memory footer.
 * Returns NULL when the footer passes every check, otherwise a static
 * string naming the first check that failed.
 */
static char *
vhd_util_check_validate_footer(struct vhd_util_check_ctx *ctx,
			       vhd_footer_t *footer)
{
	int size;
	uint32_t checksum;

	size = sizeof(footer->cookie);
	if (memcmp(footer->cookie, HD_COOKIE, size))
		return "invalid cookie";

	checksum = vhd_checksum_footer(footer);
	if (checksum != footer->checksum) {
		/*
		 * For hidden footers created by "tap" versions 0.1 and 1.1,
		 * also accept a checksum computed with the hidden field
		 * cleared.
		 */
		if (footer->hidden &&
		    !strncmp(footer->crtr_app, "tap", 3) &&
		    (footer->crtr_ver == VHD_VERSION(0, 1) ||
		     footer->crtr_ver == VHD_VERSION(1, 1))) {
			char tmp = footer->hidden;
			footer->hidden = 0;
			checksum = vhd_checksum_footer(footer);
			footer->hidden = tmp;

			if (checksum == footer->checksum)
				goto ok;
		}

		return "invalid checksum";
	}

ok:
	if (!(footer->features & HD_RESERVED))
		return "invalid 'reserved' feature";

	if (footer->features & ~(HD_TEMPORARY | HD_RESERVED))
		return "invalid extra features";

	if (footer->ff_version != HD_FF_VERSION)
		return "invalid file format version";

	/* only dynamic and differencing disks may carry a data offset */
	if (footer->type != HD_TYPE_DYNAMIC &&
	    footer->type != HD_TYPE_DIFF &&
	    footer->data_offset != ~(0ULL))
		return "invalid data offset";

	if (!ctx->opts.ignore_timestamps) {
		uint32_t now = vhd_time(time(NULL));
		if (footer->timestamp > now + TIMESTAMP_MAX_SLACK)
			return "creation time in future";
	}

	if (!strncmp(footer->crtr_app, "tap", 3) &&
	    footer->crtr_ver > VHD_CURRENT_VERSION)
		return "unsupported tap creator version";

	if (vhd_chs(footer->curr_size) < footer->geometry)
		return "geometry too large";

	if (footer->type != HD_TYPE_FIXED &&
	    footer->type != HD_TYPE_DYNAMIC &&
	    footer->type != HD_TYPE_DIFF)
		return "invalid type";

	if (footer->saved && footer->saved != 1)
		return "invalid 'saved' state";

	if (footer->hidden && footer->hidden != 1)
		return "invalid 'hidden' state";

	if (vhd_util_check_zeros(footer->reserved,
				 sizeof(footer->reserved)))
		return "invalid 'reserved' bits";

	return NULL;
}

static char *
vhd_util_check_validate_header(int fd, vhd_header_t *header)
{
	off64_t eof;
	int
i, cnt, size;
	uint32_t checksum;

	/*
	 * Validate an in-memory dynamic-disk header read from fd.
	 * Returns NULL when every check passes, otherwise a static string
	 * naming the first check that failed.
	 */
	size = sizeof(header->cookie);
	if (memcmp(header->cookie, DD_COOKIE, size))
		return "invalid cookie";

	checksum = vhd_checksum_header(header);
	if (checksum != header->checksum)
		return "invalid checksum";

	if (header->hdr_ver != 0x00010000)
		return "invalid header version";

	if (header->data_offset != ~(0ULL))
		return "invalid data offset";

	eof = lseek64(fd, 0, SEEK_END);
	if (eof == (off64_t)-1)
		return "error finding eof";

	/* the BAT must be sector aligned and end before the trailing footer */
	if (header->table_offset <= 0 ||
	    header->table_offset % 512 ||
	    (header->table_offset +
	     (header->max_bat_size * sizeof(uint32_t)) >
	     eof - sizeof(vhd_footer_t)))
		return "invalid table offset";

	/* block size must have exactly one bit set, i.e. be a power of two */
	for (cnt = 0, i = 0; i < sizeof(header->block_size) * 8; i++)
		if ((header->block_size >> i) & 1)
			cnt++;

	if (cnt != 1)
		return "invalid block size";

	if (header->res1)
		return "invalid reserved bits";

	if (vhd_util_check_zeros(header->res2, sizeof(header->res2)))
		return "invalid reserved bits";

	return NULL;
}

/*
 * Validate the parent-related header fields.
 * For differencing disks the parent timestamp (unless timestamps are
 * ignored) and the encoded parent name are checked; for any other type all
 * parent fields must be empty.
 * Returns NULL on success or a static string describing the failure.
 */
static char *
vhd_util_check_validate_differencing_header(struct vhd_util_check_ctx *ctx,
					    vhd_context_t *vhd)
{
	vhd_header_t *header;

	header = &vhd->header;

	if (vhd->footer.type == HD_TYPE_DIFF) {
		char *parent;

		if (!ctx->opts.ignore_timestamps) {
			uint32_t now = vhd_time(time(NULL));
			if (header->prt_ts > now + TIMESTAMP_MAX_SLACK)
				return "parent creation time in future";
		}

		/* the decoded name is only needed to prove it decodes */
		if (vhd_header_decode_parent(vhd, header, &parent))
			return "invalid parent name";

		free(parent);
	} else {
		if (vhd_util_check_zeros(header->prt_name,
					 sizeof(header->prt_name)))
			return "invalid non-null parent name";

		if (vhd_util_check_zeros(header->loc, sizeof(header->loc)))
			return "invalid non-null parent locators";

		if (!uuid_is_null(header->prt_uuid))
			return "invalid non-null parent uuid";

		if (header->prt_ts)
			return "invalid non-zero parent timestamp";
	}

	return NULL;
}

/*
 * Validate the batmap header of vhd.
 * Returns NULL on success or a static string describing the failure.
 */
static char *
vhd_util_check_validate_batmap(vhd_context_t *vhd, vhd_batmap_t *batmap)
{
	int size;
	off64_t eof;
	uint32_t checksum;

	size = sizeof(batmap->header.cookie);
	if
(memcmp(batmap->header.cookie, VHD_BATMAP_COOKIE, size))
		return "invalid cookie";

	if (batmap->header.batmap_version > VHD_BATMAP_CURRENT_VERSION)
		return "unsupported batmap version";

	checksum = vhd_checksum_batmap(vhd, batmap);
	if (checksum != batmap->header.checksum)
		return "invalid checksum";

	if (!batmap->header.batmap_size)
		return "invalid size zero";

	/* batmap_size is in sectors; it must provide one bit per BAT entry */
	if (batmap->header.batmap_size << (VHD_SECTOR_SHIFT + 3) <
	    vhd->header.max_bat_size)
		return "batmap-BAT size mismatch";

	eof = lseek64(vhd->fd, 0, SEEK_END);
	if (eof == (off64_t)-1)
		return "error finding eof";

	if (!batmap->header.batmap_offset ||
	    batmap->header.batmap_offset % 512)
		return "invalid batmap offset";

	/* the batmap must end before the trailing footer */
	if ((batmap->header.batmap_offset +
	     vhd_sectors_to_bytes(batmap->header.batmap_size)) >
	    eof - sizeof(vhd_footer_t))
		return "invalid batmap size";

	return NULL;
}

/*
 * Validate one parent locator entry of vhd's header.
 * An entry with code PLAT_CODE_NONE must be entirely zero; any other entry
 * needs non-zero offset, space and length and must end before the footer.
 * Returns NULL on success or a static string describing the failure.
 */
static char *
vhd_util_check_validate_parent_locator(vhd_context_t *vhd,
				       vhd_parent_locator_t *loc)
{
	off64_t eof;

	if (vhd_validate_platform_code(loc->code))
		return "invalid platform code";

	if (loc->code == PLAT_CODE_NONE) {
		if (vhd_util_check_zeros(loc, sizeof(*loc)))
			return "non-zero locator";

		return NULL;
	}

	if (!loc->data_offset)
		return "invalid data offset";

	if (!loc->data_space)
		return "invalid data space";

	if (!loc->data_len)
		return "invalid data length";

	eof = lseek64(vhd->fd, 0, SEEK_END);
	if (eof == (off64_t)-1)
		return "error finding eof";

	if (loc->data_offset + vhd_parent_locator_size(loc) >
	    eof - sizeof(vhd_footer_t))
		return "invalid size";

	if (loc->res)
		return "invalid reserved bits";

	return NULL;
}

/*
 * Check that the VHD at ppath is the parent recorded in vhd's header by
 * comparing UUIDs. Skipped (NULL returned) for raw parents and when
 * ignore_parent_uuid is set.
 * Returns NULL on success or a static string describing the failure.
 */
static char *
vhd_util_check_validate_parent(struct vhd_util_check_ctx *ctx,
			       vhd_context_t *vhd, const char *ppath)
{
	char *msg;
	vhd_context_t parent;

	msg = NULL;

	if (vhd_parent_raw(vhd))
		return msg;

	if (ctx->opts.ignore_parent_uuid)
		return msg;

	if (vhd_open(&parent, ppath,
		     VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED))
		return "error opening parent";

	if (uuid_compare(vhd->header.prt_uuid, parent.footer.uuid)) {
		msg = "invalid parent uuid";
		goto
out;
	}

out:
	vhd_close(&parent);
	return msg;
}

/*
 * Read and validate the primary footer (last sector of fd) and, unless the
 * disk is fixed, the backup footer (first sector), then compare the two.
 * On success the accepted footer is copied to *footer and 0 is returned;
 * for a valid fixed-type primary footer 0 is returned without copying.
 * Failures print a message and return a negative errno value.
 */
static int
vhd_util_check_footer(struct vhd_util_check_ctx *ctx,
		      int fd, vhd_footer_t *footer)
{
	int err;
	size_t size;
	char *msg;
	void *buf;
	off64_t eof, off;
	vhd_footer_t primary, backup;

	memset(&primary, 0, sizeof(primary));
	memset(&backup, 0, sizeof(backup));

	/* sector-aligned buffer; fd is opened with O_DIRECT by the caller */
	err = posix_memalign(&buf, VHD_SECTOR_SIZE, sizeof(primary));
	if (err) {
		printf("error allocating buffer: %d\n", err);
		return -err;
	}

	memset(buf, 0, sizeof(primary));

	eof = lseek64(fd, 0, SEEK_END);
	if (eof == (off64_t)-1) {
		err = -errno;
		printf("error calculating end of file: %d\n", err);
		goto out;
	}

	/* a file size that is not sector aligned gets a 511-byte footer */
	size = ((eof % 512) ? 511 : 512);
	eof = lseek64(fd, eof - size, SEEK_SET);
	if (eof == (off64_t)-1) {
		err = -errno;
		printf("error calculating end of file: %d\n", err);
		goto out;
	}

	err = read(fd, buf, 512);
	if (err != size) {
		err = (errno ? -errno : -EIO);
		printf("error reading primary footer: %d\n", err);
		goto out;
	}

	memcpy(&primary, buf, sizeof(primary));
	vhd_footer_in(&primary);

	msg = vhd_util_check_validate_footer(ctx, &primary);
	if (msg) {
		ctx->primary_footer_missing = 1;

		/* with -i an invalid primary footer is tolerated */
		if (ctx->opts.ignore_footer)
			goto check_backup;

		err = -EINVAL;
		printf("primary footer invalid: %s\n", msg);
		goto out;
	}

	/* fixed disks are not checked for a backup footer */
	if (primary.type == HD_TYPE_FIXED) {
		err = 0;
		goto out;
	}

check_backup:
	off = lseek64(fd, 0, SEEK_SET);
	if (off == (off64_t)-1) {
		err = -errno;
		printf("error seeking to backup footer: %d\n", err);
		goto out;
	}

	size = 512;
	memset(buf, 0, sizeof(primary));

	err = read(fd, buf, size);
	if (err != size) {
		err = (errno ?
-errno : -EIO); printf("error reading backup footer: %d\n", err); goto out; } memcpy(&backup, buf, sizeof(backup)); vhd_footer_in(&backup); msg = vhd_util_check_validate_footer(ctx, &backup); if (msg) { err = -EINVAL; printf("backup footer invalid: %s\n", msg); goto out; } if (memcmp(&primary, &backup, sizeof(primary))) { if (ctx->opts.ignore_footer) { memcpy(&primary, &backup, sizeof(primary)); goto ok; } if (backup.hidden && !strncmp(backup.crtr_app, "tap", 3) && (backup.crtr_ver == VHD_VERSION(0, 1) || backup.crtr_ver == VHD_VERSION(1, 1))) { char cmp, tmp = backup.hidden; backup.hidden = 0; cmp = memcmp(&primary, &backup, sizeof(primary)); backup.hidden = tmp; if (!cmp) goto ok; } err = -EINVAL; printf("primary and backup footers do not match\n"); goto out; } ok: err = 0; memcpy(footer, &primary, sizeof(primary)); out: free(buf); return err; } static int vhd_util_check_header(int fd, vhd_footer_t *footer) { int err; off64_t off; char *msg; void *buf; vhd_header_t header; err = posix_memalign(&buf, VHD_SECTOR_SIZE, sizeof(header)); if (err) { printf("error allocating header: %d\n", err); return err; } off = footer->data_offset; off = lseek64(fd, off, SEEK_SET); if (off == (off64_t)-1) { err = -errno; printf("error seeking to header: %d\n", err); goto out; } err = read(fd, buf, sizeof(header)); if (err != sizeof(header)) { err = (errno ? 
-errno : -EIO);
		printf("error reading header: %d\n", err);
		goto out;
	}

	memcpy(&header, buf, sizeof(header));
	vhd_header_in(&header);

	msg = vhd_util_check_validate_header(fd, &header);
	if (msg) {
		err = -EINVAL;
		printf("header is invalid: %s\n", msg);
		goto out;
	}

	err = 0;

out:
	free(buf);
	return err;
}

/*
 * Run the differencing-header validation and print the reason on failure.
 * Returns 0 on success or -EINVAL.
 */
static int
vhd_util_check_differencing_header(struct vhd_util_check_ctx *ctx,
				   vhd_context_t *vhd)
{
	char *msg;

	msg = vhd_util_check_validate_differencing_header(ctx, vhd);
	if (msg) {
		printf("differencing header is invalid: %s\n", msg);
		return -EINVAL;
	}

	return 0;
}

/*
 * Inspect the sector bitmap of one allocated block.
 * With collect_stats, every sector marked in the bitmap is counted and
 * recorded in the current stats bitmap. With check_data, the block's data
 * is read too and any sector holding data whose bitmap bit is clear is
 * reported.
 * Returns 0, a read error, or -EINVAL if such a sector was found.
 */
static int
vhd_util_check_bitmap(struct vhd_util_check_ctx *ctx,
		      vhd_context_t *vhd, uint32_t block)
{
	int err, i;
	uint64_t sector;
	char *bitmap, *data;

	data = NULL;
	bitmap = NULL;
	/* index of the block's first sector within the whole disk */
	sector = (uint64_t)block * vhd->spb;

	err = vhd_read_bitmap(vhd, block, &bitmap);
	if (err) {
		printf("error reading bitmap 0x%x\n", block);
		goto out;
	}

	if (ctx->opts.check_data) {
		err = vhd_read_block(vhd, block, &data);
		if (err) {
			printf("error reading data block 0x%x\n", block);
			goto out;
		}
	}

	for (i = 0; i < vhd->spb; i++) {
		if (ctx->opts.collect_stats &&
		    vhd_bitmap_test(vhd, bitmap, i)) {
			ctx_cur_stats(ctx)->secs_written++;
			set_bit_u64(ctx_cur_stats(ctx)->bitmap, sector + i);
		}

		if (ctx->opts.check_data) {
			char *buf = data + (i << VHD_SECTOR_SHIFT);
			int set = vhd_util_check_zeros(buf, VHD_SECTOR_SIZE);
			int map = vhd_bitmap_test(vhd, bitmap, i);

			/* keep scanning so every bad sector is reported */
			if (set && !map) {
				printf("sector 0x%x of block 0x%x has data "
				       "where bitmap is clear\n", i, block);
				err = -EINVAL;
			}
		}
	}

out:
	free(data);
	free(bitmap);
	return err;
}

/*
 * Check the block allocation table of vhd: every allocated block must lie
 * between the end of the headers and the footer and must not overlap any
 * other block. Optionally checks each block's bitmap/data and collects
 * statistics.
 * Returns 0 on success or a negative errno value.
 */
static int
vhd_util_check_bat(struct vhd_util_check_ctx *ctx, vhd_context_t *vhd)
{
	off64_t eof, eoh;
	uint64_t vhd_blks;
	int i, j, err, block_size;

	if (ctx->opts.collect_stats) {
		err = vhd_util_check_stats_alloc_one(ctx, vhd);
		if (err)
			return err;
	}

	err = vhd_seek(vhd, 0, SEEK_END);
	if (err) {
		printf("error calculating eof: %d\n", err);
		return err;
	}

	eof = vhd_position(vhd);
	if (eof == (off64_t)-1) {
		printf("error calculating 
eof: %d\n", -errno);
		return -errno;
	}

	/* adjust eof for vhds with short footers */
	if (eof % 512) {
		if (eof % 512 != 511) {
			printf("invalid file size: 0x%"PRIx64"\n", eof);
			return -EINVAL;
		}

		eof++;
	}

	err = vhd_get_bat(vhd);
	if (err) {
		printf("error reading bat: %d\n", err);
		return err;
	}

	err = vhd_end_of_headers(vhd, &eoh);
	if (err) {
		printf("error calculating end of metadata: %d\n", err);
		return err;
	}

	/* from here on eof and eoh are sector numbers, footer excluded */
	eof -= sizeof(vhd_footer_t);
	eof >>= VHD_SECTOR_SHIFT;
	eoh >>= VHD_SECTOR_SHIFT;
	/* on-disk size of one block: data sectors plus its bitmap sectors */
	block_size = vhd->spb + vhd->bm_secs;

	vhd_blks = vhd->footer.curr_size >> VHD_BLOCK_SHIFT;
	if (vhd_blks > vhd->header.max_bat_size) {
		printf("VHD size (%"PRIu64" blocks) exceeds BAT size (%u)\n",
		       vhd_blks, vhd->header.max_bat_size);
		return -EINVAL;
	}

	for (i = 0; i < vhd_blks; i++) {
		uint32_t off = vhd->bat.bat[i];
		if (off == DD_BLK_UNUSED)
			continue;

		if (off < eoh) {
			printf("block %d (offset 0x%x) clobbers headers\n",
			       i, off);
			return -EINVAL;
		}

		if (off + block_size > eof) {
			/*
			 * a one-sector overrun is tolerated when the primary
			 * footer is missing and -i was given
			 */
			if (!(ctx->primary_footer_missing &&
			      ctx->opts.ignore_footer &&
			      off + block_size == eof + 1)) {
				printf("block %d (offset 0x%x) clobbers "
				       "footer\n", i, off);
				return -EINVAL;
			}
		}

		/* compare against every other allocated block for overlap */
		for (j = 0; j < vhd_blks; j++) {
			uint32_t joff = vhd->bat.bat[j];

			if (i == j)
				continue;

			if (joff == DD_BLK_UNUSED)
				continue;

			if (off == joff)
				err = -EINVAL;

			if (off > joff && off < joff + block_size)
				err = -EINVAL;

			if (off + block_size > joff &&
			    off + block_size < joff + block_size)
				err = -EINVAL;

			if (err) {
				printf("block %d (offset 0x%x) clobbers "
				       "block %d (offset 0x%x)\n",
				       i, off, j, joff);
				return err;
			}
		}

		if (ctx->opts.check_data || ctx->opts.collect_stats) {
			if (ctx->opts.collect_stats)
				ctx_cur_stats(ctx)->secs_allocated += vhd->spb;

			err = vhd_util_check_bitmap(ctx, vhd, i);
			if (err)
				return err;
		}
	}

	return 0;
}

/*
 * Validate the batmap of vhd and check that no block it marks is
 * unallocated in the BAT.
 * Returns 0 on success or a negative errno value.
 */
static int
vhd_util_check_batmap(vhd_context_t *vhd)
{
	char *msg;
	int i, err;

	err = vhd_get_bat(vhd);
	if (err) {
		printf("error reading bat: %d\n", err);
		return err;
	}

	err = vhd_get_batmap(vhd);
	if (err) {
		printf("error reading 
batmap: %d\n", err);
		return err;
	}

	msg = vhd_util_check_validate_batmap(vhd, &vhd->batmap);
	if (msg) {
		printf("batmap is invalid: %s\n", msg);
		return -EINVAL;
	}

	for (i = 0; i < vhd->footer.curr_size >> VHD_BLOCK_SHIFT; i++) {
		if (!vhd_batmap_test(vhd, &vhd->batmap, i))
			continue;

		/* a block the batmap marks must be allocated in the BAT */
		if (vhd->bat.bat[i] == DD_BLK_UNUSED) {
			printf("batmap shows unallocated block %d full\n", i);
			return -EINVAL;
		}
	}

	return 0;
}

/*
 * Check every parent locator of a differencing disk: each entry must be
 * valid, each platform code may appear only once, and for MACX/W2RU/W2KU
 * entries the stored name must match the header's parent name and resolve
 * to a parent that passes vhd_util_check_validate_parent().
 * Returns 0 if at least one such parent was found, else a negative errno
 * value.
 */
static int
vhd_util_check_parent_locators(struct vhd_util_check_ctx *ctx,
			       vhd_context_t *vhd)
{
	int i, n, err;
	vhd_parent_locator_t *loc;
	char *msg, *file, *ppath, *location, *pname;
	int mac, macx, w2ku, w2ru, wi2r, wi2k, found;

	/* per-platform-code counters used to detect duplicates */
	mac = 0;
	macx = 0;
	w2ku = 0;
	w2ru = 0;
	wi2r = 0;
	wi2k = 0;
	found = 0;
	pname = NULL;
	ppath = NULL;
	location = NULL;

	err = vhd_header_decode_parent(vhd, &vhd->header, &pname);
	if (err) {
		printf("error decoding parent name: %d\n", err);
		return err;
	}

	n = sizeof(vhd->header.loc) / sizeof(vhd->header.loc[0]);
	for (i = 0; i < n; i++) {
		ppath = NULL;
		location = NULL;
		loc = vhd->header.loc + i;

		msg = vhd_util_check_validate_parent_locator(vhd, loc);
		if (msg) {
			err = -EINVAL;
			printf("invalid parent locator %d: %s\n", i, msg);
			goto out;
		}

		if (loc->code == PLAT_CODE_NONE)
			continue;

		switch (loc->code) {
		case PLAT_CODE_MACX:
			if (macx++)
				goto dup;
			break;

		case PLAT_CODE_MAC:
			if (mac++)
				goto dup;
			break;

		case PLAT_CODE_W2KU:
			if (w2ku++)
				goto dup;
			break;

		case PLAT_CODE_W2RU:
			if (w2ru++)
				goto dup;
			break;

		case PLAT_CODE_WI2R:
			if (wi2r++)
				goto dup;
			break;

		case PLAT_CODE_WI2K:
			if (wi2k++)
				goto dup;
			break;

		default:
			err = -EINVAL;
			printf("invalid platform code for locator %d\n", i);
			goto out;
		}

		/* only these three locator types are read and resolved */
		if (loc->code != PLAT_CODE_MACX &&
		    loc->code != PLAT_CODE_W2RU &&
		    loc->code != PLAT_CODE_W2KU)
			continue;

		err = vhd_parent_locator_read(vhd, loc, &ppath);
		if (err) {
			printf("error reading parent locator %d: %d\n", i, err);
			goto out;
		}

		file = basename(ppath);
		if (strcmp(pname, file)) {
			err = -EINVAL;
			printf("parent locator %d name (%s) does not match "
			       "header name (%s)\n", i,
file, pname); goto out; } err = vhd_find_parent(vhd, ppath, &location); if (err) { printf("error resolving %s: %d\n", ppath, err); goto out; } err = access(location, R_OK); if (err && loc->code == PLAT_CODE_MACX) { err = -errno; printf("parent locator %d points to missing file %s " "(resolved to %s)\n", i, ppath, location); goto out; } msg = vhd_util_check_validate_parent(ctx, vhd, location); if (msg) { err = -EINVAL; printf("invalid parent %s: %s\n", location, msg); goto out; } found++; free(ppath); free(location); ppath = NULL; location = NULL; continue; dup: printf("duplicate platform code in locator %d: 0x%x\n", i, loc->code); err = -EINVAL; goto out; } if (!found) { err = -EINVAL; printf("could not find parent %s\n", pname); goto out; } err = 0; out: free(pname); free(ppath); free(location); return err; } static void vhd_util_dump_headers(const char *name) { char *argv[] = { "read", "-p", "-n", (char *)name }; int argc = sizeof(argv) / sizeof(argv[0]); printf("%s appears invalid; dumping metadata\n", name); vhd_util_read(argc, argv); } static int vhd_util_check_vhd(struct vhd_util_check_ctx *ctx, const char *name) { int fd, err; vhd_context_t vhd; struct stat stats; vhd_footer_t footer; fd = -1; memset(&vhd, 0, sizeof(vhd)); memset(&footer, 0, sizeof(footer)); err = stat(name, &stats); if (err == -1) { printf("cannot stat %s: %d\n", name, errno); return -errno; } if (!S_ISREG(stats.st_mode) && !S_ISBLK(stats.st_mode)) { printf("%s is not a regular file or block device\n", name); return -EINVAL; } fd = open(name, O_RDONLY | O_DIRECT | O_LARGEFILE); if (fd == -1) { printf("error opening %s\n", name); return -errno; } err = vhd_util_check_footer(ctx, fd, &footer); if (err) goto out; if (footer.type != HD_TYPE_DYNAMIC && footer.type != HD_TYPE_DIFF) goto out; err = vhd_util_check_header(fd, &footer); if (err) goto out; err = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED); if (err) goto out; err = vhd_util_check_differencing_header(ctx, &vhd); if 
(err)
		goto out;

	err = vhd_util_check_bat(ctx, &vhd);
	if (err)
		goto out;

	if (vhd_has_batmap(&vhd)) {
		err = vhd_util_check_batmap(&vhd);
		if (err)
			goto out;
	}

	if (vhd.footer.type == HD_TYPE_DIFF) {
		err = vhd_util_check_parent_locators(ctx, &vhd);
		if (err)
			goto out;
	}

	err = 0;

	/* the "is valid" line is suppressed when statistics are collected */
	if (!ctx->opts.collect_stats)
		printf("%s is valid\n", name);

out:
	if (err)
		vhd_util_dump_headers(name);
	if (fd != -1)
		close(fd);
	vhd_close(&vhd);
	return err;
}

/*
 * Walk the parent chain of name and run vhd_util_check_vhd() on every
 * ancestor, stopping at the first image that is not a differencing disk
 * or whose parent is raw.
 * Returns 0 on success or the first error encountered.
 */
static int
vhd_util_check_parents(struct vhd_util_check_ctx *ctx, const char *name)
{
	int err;
	vhd_context_t vhd;
	char *cur, *parent;

	/* cur aliases name on the first pass; later values are heap-owned */
	cur = (char *)name;

	for (;;) {
		err = vhd_open(&vhd, cur,
			       VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED);
		if (err)
			goto out;

		if (vhd.footer.type != HD_TYPE_DIFF || vhd_parent_raw(&vhd)) {
			vhd_close(&vhd);
			goto out;
		}

		err = vhd_parent_locator_get(&vhd, &parent);
		vhd_close(&vhd);

		if (err) {
			printf("error getting parent: %d\n", err);
			goto out;
		}

		/* never free the caller's string */
		if (cur != name)
			free(cur);
		cur = parent;

		err = vhd_util_check_vhd(ctx, cur);
		if (err)
			goto out;
	}

out:
	if (err)
		printf("error checking parents: %d\n", err);
	if (cur != name)
		free(cur);
	return err;
}

/*
 * Entry point of the "check" command: parse the options, check the named
 * VHD and, with -p, its parents; with -s print the collected statistics.
 * Returns 0 on success or a negative errno value.
 */
int
vhd_util_check(int argc, char **argv)
{
	char *name;
	int c, err, parents;
	struct vhd_util_check_ctx ctx;

	if (!argc || !argv) {
		err = -EINVAL;
		goto usage;
	}

	name = NULL;
	parents = 0;
	memset(&ctx, 0, sizeof(ctx));
	vhd_util_check_stats_init(&ctx);

	/* restart getopt() scanning from the beginning of argv */
	optind = 0;
	while ((c = getopt(argc, argv, "n:iItpbsh")) != -1) {
		switch (c) {
		case 'n':
			name = optarg;
			break;
		case 'i':
			ctx.opts.ignore_footer = 1;
			break;
		case 'I':
			ctx.opts.ignore_parent_uuid = 1;
			break;
		case 't':
			ctx.opts.ignore_timestamps = 1;
			break;
		case 'p':
			parents = 1;
			break;
		case 'b':
			ctx.opts.check_data = 1;
			break;
		case 's':
			ctx.opts.collect_stats = 1;
			break;
		case 'h':
			err = 0;
			goto usage;
		default:
			err = -EINVAL;
			goto usage;
		}
	}

	if (!name || optind != argc) {
		err = -EINVAL;
		goto usage;
	}

	err = vhd_util_check_vhd(&ctx, name);
	if (err)
		goto out;

	if (parents)
		err = vhd_util_check_parents(&ctx, name);

	if (ctx.opts.collect_stats)
vhd_util_check_stats_print(&ctx); vhd_util_check_stats_free(&ctx); out: return err; usage: printf("options: -n [-i ignore missing primary footers] " "[-I ignore parent uuids] [-t ignore timestamps] " "[-p check parents] [-b check bitmaps] [-s stats] [-h help]\n"); return err; } blktap-2.0.90/vhd/lib/vhd-util-read.c0000644000000000000000000004733511664745551015747 0ustar rootroot/* * Copyright (c) 2007, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include "libvhd.h" #include "vhd-util.h" #define nsize 15 static char nbuf[nsize]; static inline char * __xconv(uint64_t num) { snprintf(nbuf, nsize, "%#" PRIx64 , num); return nbuf; } static inline char * __dconv(uint64_t num) { snprintf(nbuf, nsize, "%" PRIu64, num); return nbuf; } #define conv(hex, num) \ (hex ? __xconv((uint64_t)num) : __dconv((uint64_t)num)) static void vhd_print_header(vhd_context_t *vhd, vhd_header_t *h, int hex) { int err; uint32_t cksm; char uuid[37], time_str[26], cookie[9], *name; printf("VHD Header Summary:\n-------------------\n"); snprintf(cookie, 9, "%s", h->cookie); printf("Cookie : %s\n", cookie); printf("Data offset (unusd) : %s\n", conv(hex, h->data_offset)); printf("Table offset : %s\n", conv(hex, h->table_offset)); printf("Header version : 0x%08x\n", h->hdr_ver); printf("Max BAT size : %s\n", conv(hex, h->max_bat_size)); printf("Block size : %s ", conv(hex, h->block_size)); printf("(%s MB)\n", conv(hex, h->block_size >> 20)); err = vhd_header_decode_parent(vhd, h, &name); printf("Parent name : %s\n", (err ? "failed to read name" : name)); free(name); uuid_unparse(h->prt_uuid, uuid); printf("Parent UUID : %s\n", uuid); vhd_time_to_string(h->prt_ts, time_str); printf("Parent timestamp : %s\n", time_str); cksm = vhd_checksum_header(h); printf("Checksum : 0x%x|0x%x (%s)\n", h->checksum, cksm, h->checksum == cksm ? "Good!" 
: "Bad!");
	printf("\n");
}

/* String table for hd.type */
char *hd_type_str[7] = {
	"None", /* 0 */
	"Reserved (deprecated)", /* 1 */
	"Fixed hard disk", /* 2 */
	"Dynamic hard disk", /* 3 */
	"Differencing hard disk", /* 4 */
	"Reserved (deprecated)", /* 5 */
	"Reserved (deprecated)" /* 6 */
};

/*
 * Print a summary of footer f; numeric fields are printed in hex when hex
 * is non-zero, else in decimal.
 */
static void
vhd_print_footer(vhd_footer_t *f, int hex)
{
	uint64_t c, h, s;
	uint32_t ff_maj, ff_min, cr_maj, cr_min, cksm;
	char time_str[26], creator[5], uuid[37], cookie[9];

	printf("VHD Footer Summary:\n-------------------\n");

	snprintf(cookie, 9, "%s", f->cookie);
	printf("Cookie : %s\n", cookie);

	/*
	 * NOTE(review): both branches of each conditional below are empty
	 * strings in this copy, so no feature name is ever printed; the
	 * original text may have been lost -- confirm against upstream.
	 */
	printf("Features : (0x%08x) %s%s\n", f->features,
	       (f->features & HD_TEMPORARY) ? "" : "",
	       (f->features & HD_RESERVED) ? "" : "");

	/* version words hold the major in the high and minor in the low 16 bits */
	ff_maj = f->ff_version >> 16;
	ff_min = f->ff_version & 0xffff;
	printf("File format version : Major: %d, Minor: %d\n",
	       ff_maj, ff_min);

	printf("Data offset : %s\n", conv(hex, f->data_offset));

	vhd_time_to_string(f->timestamp, time_str);
	printf("Timestamp : %s\n", time_str);

	/* crtr_app is copied as 4 bytes and terminated locally */
	memcpy(creator, f->crtr_app, 4);
	creator[4] = '\0';
	printf("Creator Application : '%s'\n", creator);

	cr_maj = f->crtr_ver >> 16;
	cr_min = f->crtr_ver & 0xffff;
	printf("Creator version : Major: %d, Minor: %d\n",
	       cr_maj, cr_min);

	printf("Creator OS : %s\n",
	       ((f->crtr_os == HD_CR_OS_WINDOWS) ? "Windows" :
		((f->crtr_os == HD_CR_OS_MACINTOSH) ? "Macintosh" :
		 "Unknown!")));

	printf("Original disk size : %s MB ", conv(hex, f->orig_size >> 20));
	printf("(%s Bytes)\n", conv(hex, f->orig_size));

	printf("Current disk size : %s MB ", conv(hex, f->curr_size >> 20));
	printf("(%s Bytes)\n", conv(hex, f->curr_size));

	/* geometry packs cylinders (high 16 bits), heads and sectors */
	c = f->geometry >> 16;
	h = (f->geometry & 0x0000FF00) >> 8;
	s = f->geometry & 0x000000FF;
	printf("Geometry : Cyl: %s, ", conv(hex, c));
	printf("Hds: %s, ", conv(hex, h));
	printf("Sctrs: %s\n", conv(hex, s));
	printf(" : = %s MB ", conv(hex, (c * h * s) >> 11));
	printf("(%s Bytes)\n", conv(hex, c * h * s << 9));

	printf("Disk type : %s\n",
	       f->type <= HD_TYPE_MAX ?
hd_type_str[f->type] : "Unknown type!\n"); cksm = vhd_checksum_footer(f); printf("Checksum : 0x%x|0x%x (%s)\n", f->checksum, cksm, f->checksum == cksm ? "Good!" : "Bad!"); uuid_unparse(f->uuid, uuid); printf("UUID : %s\n", uuid); printf("Saved state : %s\n", f->saved == 0 ? "No" : "Yes"); printf("Hidden : %d\n", f->hidden); printf("\n"); } static inline char * code_name(uint32_t code) { switch(code) { case PLAT_CODE_NONE: return "PLAT_CODE_NONE"; case PLAT_CODE_WI2R: return "PLAT_CODE_WI2R"; case PLAT_CODE_WI2K: return "PLAT_CODE_WI2K"; case PLAT_CODE_W2RU: return "PLAT_CODE_W2RU"; case PLAT_CODE_W2KU: return "PLAT_CODE_W2KU"; case PLAT_CODE_MAC: return "PLAT_CODE_MAC"; case PLAT_CODE_MACX: return "PLAT_CODE_MACX"; default: return "UNKOWN"; } } static void vhd_print_parent(vhd_context_t *vhd, vhd_parent_locator_t *loc) { int err; char *buf; err = vhd_parent_locator_read(vhd, loc, &buf); if (err) { printf("failed to read parent name\n"); return; } printf(" decoded name : %s\n", buf); } static void vhd_print_parent_locators(vhd_context_t *vhd, int hex) { int i, n; vhd_parent_locator_t *loc; printf("VHD Parent Locators:\n--------------------\n"); n = sizeof(vhd->header.loc) / sizeof(struct prt_loc); for (i = 0; i < n; i++) { loc = &vhd->header.loc[i]; if (loc->code == PLAT_CODE_NONE) continue; printf("locator: : %d\n", i); printf(" code : %s\n", code_name(loc->code)); printf(" data_space : %s\n", conv(hex, loc->data_space)); printf(" data_length : %s\n", conv(hex, loc->data_len)); printf(" data_offset : %s\n", conv(hex, loc->data_offset)); vhd_print_parent(vhd, loc); printf("\n"); } } static void vhd_print_batmap_header(vhd_context_t *vhd, vhd_batmap_t *batmap, int hex) { uint32_t cksm; printf("VHD Batmap Summary:\n-------------------\n"); printf("Batmap offset : %s\n", conv(hex, batmap->header.batmap_offset)); printf("Batmap size (secs) : %s\n", conv(hex, batmap->header.batmap_size)); printf("Batmap version : 0x%08x\n", batmap->header.batmap_version); cksm = 
vhd_checksum_batmap(vhd, batmap); printf("Checksum : 0x%x|0x%x (%s)\n", batmap->header.checksum, cksm, (batmap->header.checksum == cksm ? "Good!" : "Bad!")); printf("\n"); } static inline int check_block_range(vhd_context_t *vhd, uint64_t block, int hex) { if (block > vhd->header.max_bat_size) { fprintf(stderr, "block %s past end of file\n", conv(hex, block)); return -ERANGE; } return 0; } static int vhd_print_headers(vhd_context_t *vhd, int hex) { int err; vhd_print_footer(&vhd->footer, hex); if (vhd_type_dynamic(vhd)) { vhd_print_header(vhd, &vhd->header, hex); if (vhd->footer.type == HD_TYPE_DIFF) vhd_print_parent_locators(vhd, hex); if (vhd_has_batmap(vhd)) { err = vhd_get_batmap(vhd); if (err) { printf("failed to get batmap header\n"); return err; } vhd_print_batmap_header(vhd, &vhd->batmap, hex); } } return 0; } static int vhd_dump_headers(const char *name, int hex) { vhd_context_t vhd; libvhd_set_log_level(1); memset(&vhd, 0, sizeof(vhd)); printf("\n%s appears invalid; dumping headers\n\n", name); vhd.fd = open(name, O_DIRECT | O_LARGEFILE | O_RDONLY); if (vhd.fd == -1) return -errno; vhd.file = strdup(name); vhd_read_footer(&vhd, &vhd.footer); vhd_read_header(&vhd, &vhd.header); vhd_print_footer(&vhd.footer, hex); vhd_print_header(&vhd, &vhd.header, hex); close(vhd.fd); free(vhd.file); return 0; } static int vhd_print_logical_to_physical(vhd_context_t *vhd, uint64_t sector, int count, int hex) { int i; uint32_t blk, lsec; uint64_t cur, offset; if (vhd_sectors_to_bytes(sector + count) > vhd->footer.curr_size) { fprintf(stderr, "sector %s past end of file\n", conv(hex, sector + count)); return -ERANGE; } for (i = 0; i < count; i++) { cur = sector + i; blk = cur / vhd->spb; lsec = cur % vhd->spb; offset = vhd->bat.bat[blk]; if (offset != DD_BLK_UNUSED) { offset += lsec + 1; offset = vhd_sectors_to_bytes(offset); } printf("logical sector %s: ", conv(hex, cur)); printf("block number: %s, ", conv(hex, blk)); printf("sector offset: %s, ", conv(hex, lsec)); 
printf("file offset: %s\n", (offset == DD_BLK_UNUSED ? "not allocated" : conv(hex, offset))); } return 0; } static int vhd_print_bat(vhd_context_t *vhd, uint64_t block, int count, int hex) { int i; uint64_t cur, offset; if (check_block_range(vhd, block + count, hex)) return -ERANGE; for (i = 0; i < count; i++) { cur = block + i; offset = vhd->bat.bat[cur]; printf("block: %s: ", conv(hex, cur)); printf("offset: %s\n", (offset == DD_BLK_UNUSED ? "not allocated" : conv(hex, vhd_sectors_to_bytes(offset)))); } return 0; } static int vhd_print_bat_str(vhd_context_t *vhd) { int i, err, total_blocks, bitmap_size; char *bitmap; ssize_t n; err = 0; if (!vhd_type_dynamic(vhd)) return -EINVAL; total_blocks = vhd->footer.curr_size / vhd->header.block_size; bitmap_size = total_blocks >> 3; if (bitmap_size << 3 < total_blocks) bitmap_size++; bitmap = malloc(bitmap_size); if (!bitmap) return -ENOMEM; memset(bitmap, 0, bitmap_size); for (i = 0; i < total_blocks; i++) { if (vhd->bat.bat[i] != DD_BLK_UNUSED) set_bit(bitmap, i); } n = write(STDOUT_FILENO, bitmap, bitmap_size); if (n < 0) err = -errno; free(bitmap); return err; } static int vhd_print_bitmap(vhd_context_t *vhd, uint64_t block, int count, int hex) { char *buf; int i, err; uint64_t cur; ssize_t n; if (check_block_range(vhd, block + count, hex)) return -ERANGE; for (i = 0; i < count; i++) { cur = block + i; if (vhd->bat.bat[cur] == DD_BLK_UNUSED) { printf("block %s not allocated\n", conv(hex, cur)); continue; } err = vhd_read_bitmap(vhd, cur, &buf); if (err) goto out; n = write(STDOUT_FILENO, buf, vhd_sectors_to_bytes(vhd->bm_secs)); if (n < 0) { err = -errno; goto out; } free(buf); } err = 0; out: return err; } static int vhd_test_bitmap(vhd_context_t *vhd, uint64_t sector, int count, int hex) { char *buf; uint64_t cur; int i, err, bit; uint32_t blk, bm_blk, sec; if (vhd_sectors_to_bytes(sector + count) > vhd->footer.curr_size) { printf("sector %s past end of file\n", conv(hex, sector)); return -ERANGE; } bm_blk = -1; buf 
= NULL; for (i = 0; i < count; i++) { cur = sector + i; blk = cur / vhd->spb; sec = cur % vhd->spb; if (blk != bm_blk) { bm_blk = blk; free(buf); buf = NULL; if (vhd->bat.bat[blk] != DD_BLK_UNUSED) { err = vhd_read_bitmap(vhd, blk, &buf); if (err) goto out; } } if (vhd->bat.bat[blk] == DD_BLK_UNUSED) bit = 0; else bit = vhd_bitmap_test(vhd, buf, sec); printf("block %s: ", conv(hex, blk)); printf("sec: %s: %d\n", conv(hex, sec), bit); } err = 0; out: free(buf); return err; } static int vhd_print_bitmap_extents(vhd_context_t *vhd, uint64_t sector, int count, int hex) { char *buf; uint64_t cur; int i, err, bit; uint32_t blk, bm_blk, sec; int64_t s, r; if (vhd_sectors_to_bytes(sector + count) > vhd->footer.curr_size) { printf("sector %s past end of file\n", conv(hex, sector)); return -ERANGE; } bm_blk = -1; buf = NULL; s = -1; r = 0; for (i = 0; i < count; i++) { cur = sector + i; blk = cur / vhd->spb; sec = cur % vhd->spb; if (blk != bm_blk) { bm_blk = blk; free(buf); buf = NULL; if (vhd->bat.bat[blk] != DD_BLK_UNUSED) { err = vhd_read_bitmap(vhd, blk, &buf); if (err) goto out; } } if (vhd->bat.bat[blk] == DD_BLK_UNUSED) bit = 0; else bit = vhd_bitmap_test(vhd, buf, sec); if (bit) { if (r == 0) s = cur; r++; } else { if (r > 0) { printf("%s ", conv(hex, s)); printf("%s\n", conv(hex, r)); } r = 0; } } if (r > 0) { printf("%s ", conv(hex, s)); printf("%s\n", conv(hex, r)); } err = 0; out: free(buf); return err; } static int vhd_print_batmap(vhd_context_t *vhd) { int err, gcc; size_t size; err = vhd_get_batmap(vhd); if (err) { printf("failed to read batmap: %d\n", err); return err; } size = vhd_sectors_to_bytes(vhd->batmap.header.batmap_size); gcc = write(STDOUT_FILENO, vhd->batmap.map, size); if (gcc) ; return 0; } static int vhd_test_batmap(vhd_context_t *vhd, uint64_t block, int count, int hex) { int i, err; uint64_t cur; if (check_block_range(vhd, block + count, hex)) return -ERANGE; err = vhd_get_batmap(vhd); if (err) { fprintf(stderr, "failed to get batmap\n"); 
return err; } for (i = 0; i < count; i++) { cur = block + i; fprintf(stderr, "batmap for block %s: %d\n", conv(hex, cur), vhd_batmap_test(vhd, &vhd->batmap, cur)); } return 0; } static int vhd_print_data(vhd_context_t *vhd, uint64_t block, int count, int hex) { char *buf; int i, err; uint64_t cur; err = 0; if (check_block_range(vhd, block + count, hex)) return -ERANGE; for (i = 0; i < count; i++) { int gcc; cur = block + i; if (vhd->bat.bat[cur] == DD_BLK_UNUSED) { printf("block %s not allocated\n", conv(hex, cur)); continue; } err = vhd_read_block(vhd, cur, &buf); if (err) break; gcc = write(STDOUT_FILENO, buf, vhd->header.block_size); if (gcc) ; free(buf); } return err; } static int vhd_read_data(vhd_context_t *vhd, uint64_t sec, int count, int hex) { void *buf; uint64_t cur; int err, max, secs; if (vhd_sectors_to_bytes(sec + count) > vhd->footer.curr_size) return -ERANGE; max = MIN(vhd_sectors_to_bytes(count), VHD_BLOCK_SIZE); err = posix_memalign(&buf, VHD_SECTOR_SIZE, max); if (err) return -err; cur = sec; while (count) { int gcc; secs = MIN((max >> VHD_SECTOR_SHIFT), count); err = vhd_io_read(vhd, buf, cur, secs); if (err) break; gcc = write(STDOUT_FILENO, buf, vhd_sectors_to_bytes(secs)); if (gcc) ; cur += secs; count -= secs; } free(buf); return err; } static int vhd_read_bytes(vhd_context_t *vhd, uint64_t byte, int count, int hex) { void *buf; uint64_t cur; int err, max, bytes; if (byte + count > vhd->footer.curr_size) return -ERANGE; max = MIN(count, VHD_BLOCK_SIZE); err = posix_memalign(&buf, VHD_SECTOR_SIZE, max); if (err) return -err; cur = byte; while (count) { ssize_t n; bytes = MIN(max, count); err = vhd_io_read_bytes(vhd, buf, bytes, cur); if (err) break; n = write(STDOUT_FILENO, buf, bytes); if (n < 0) { err = -errno; break; } cur += bytes; count -= bytes; } free(buf); return err; } int vhd_util_read(int argc, char **argv) { char *name; vhd_context_t vhd; int c, err, headers, hex, bat_str, cache, flags; uint64_t bat, bitmap, tbitmap, ebitmap, 
batmap, tbatmap, data, lsec, count, read; uint64_t bread; err = 0; hex = 0; cache = 0; headers = 0; bat_str = 0; count = 1; bat = -1; bitmap = -1; tbitmap = -1; ebitmap = -1; batmap = -1; tbatmap = -1; data = -1; lsec = -1; read = -1; bread = -1; name = NULL; if (!argc || !argv) goto usage; optind = 0; while ((c = getopt(argc, argv, "n:pt:b:Bm:i:e:aj:d:c:r:R:xCh")) != -1) { switch(c) { case 'n': name = optarg; break; case 'p': headers = 1; break; case 'C': cache = 1; break; case 'B': bat_str = 1; break; case 't': lsec = strtoul(optarg, NULL, 10); break; case 'b': bat = strtoull(optarg, NULL, 10); break; case 'm': bitmap = strtoull(optarg, NULL, 10); break; case 'i': tbitmap = strtoul(optarg, NULL, 10); break; case 'e': ebitmap = strtoul(optarg, NULL, 10); break; case 'a': batmap = 1; break; case 'j': tbatmap = strtoull(optarg, NULL, 10); break; case 'd': data = strtoull(optarg, NULL, 10); break; case 'r': read = strtoull(optarg, NULL, 10); break; case 'R': bread = strtoull(optarg, NULL, 10); break; case 'c': count = strtoul(optarg, NULL, 10); break; case 'x': hex = 1; break; case 'h': default: goto usage; } } if (!name || optind != argc) goto usage; flags = VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED; if (cache) flags |= VHD_OPEN_CACHED | VHD_OPEN_FAST; err = vhd_open(&vhd, name, flags); if (err) { printf("Failed to open %s: %d\n", name, err); vhd_dump_headers(name, hex); return err; } err = vhd_get_bat(&vhd); if (err) { printf("Failed to get bat for %s: %d\n", name, err); goto out; } if (headers) vhd_print_headers(&vhd, hex); if (lsec != -1) { err = vhd_print_logical_to_physical(&vhd, lsec, count, hex); if (err) goto out; } if (bat != -1) { err = vhd_print_bat(&vhd, bat, count, hex); if (err) goto out; } if (bat_str) { err = vhd_print_bat_str(&vhd); if (err) goto out; } if (bitmap != -1) { err = vhd_print_bitmap(&vhd, bitmap, count, hex); if (err) goto out; } if (tbitmap != -1) { err = vhd_test_bitmap(&vhd, tbitmap, count, hex); if (err) goto out; } if (ebitmap != 
-1) { err = vhd_print_bitmap_extents(&vhd, ebitmap, count, hex); if (err) goto out; } if (batmap != -1) { err = vhd_print_batmap(&vhd); if (err) goto out; } if (tbatmap != -1) { err = vhd_test_batmap(&vhd, tbatmap, count, hex); if (err) goto out; } if (data != -1) { err = vhd_print_data(&vhd, data, count, hex); if (err) goto out; } if (read != -1) { err = vhd_read_data(&vhd, read, count, hex); if (err) goto out; } if (bread != -1) { err = vhd_read_bytes(&vhd, bread, count, hex); if (err) goto out; } err = 0; out: vhd_close(&vhd); return err; usage: printf("options:\n" "-h help\n" "-n name\n" "-p print VHD headers\n" "-t sec translate logical sector to VHD location\n" "-b blk print bat entry\n" "-B print entire bat as a bitmap\n" "-m blk print bitmap\n" "-i sec test bitmap for logical sector\n" "-e sec output extent list of allocated logical sectors\n" "-a print batmap\n" "-j blk test batmap for block\n" "-d blk print data\n" "-c num num units\n" "-r sec read num sectors at sec\n" "-R byte read num bytes at byte\n" "-x print in hex\n"); return EINVAL; } blktap-2.0.90/vhd/lib/relative-path.h0000644000000000000000000000370411664745551016044 0ustar rootroot/* * Copyright (c) 2007, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _RELATIVE_PATH_H_ #define _RELATIVE_PATH_H_ #include #define DELIMITER '/' #define MAX_NAME_LEN 1000 #define EPRINTF(_f, _a...) syslog(LOG_ERR, "tap-err:%s: " _f, __func__, ##_a) /* * returns a relative path from @src to @dest * result should be freed */ char *relative_path_to(char *src, char *dest, int *err); #endif blktap-2.0.90/vhd/lib/vhd-util-scan.c0000644000000000000000000006714211664745551015756 0ustar rootroot/* * Copyright (c) 2007, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include "list.h" #include "libvhd.h" #include "lvm-util.h" #define VHD_SCAN_FAST 0x01 #define VHD_SCAN_PRETTY 0x02 #define VHD_SCAN_VOLUME 0x04 #define VHD_SCAN_NOFAIL 0x08 #define VHD_SCAN_VERBOSE 0x10 #define VHD_SCAN_PARENTS 0x20 #define VHD_SCAN_MARKERS 0x40 #define VHD_TYPE_RAW_FILE 0x01 #define VHD_TYPE_VHD_FILE 0x02 #define VHD_TYPE_RAW_VOLUME 0x04 #define VHD_TYPE_VHD_VOLUME 0x08 #define EPRINTF(_f, _a...) 
\ do { \ syslog(LOG_INFO, "%s: " _f, __func__, ##_a); \ } while (0) static inline int target_volume(uint8_t type) { return (type == VHD_TYPE_RAW_VOLUME || type == VHD_TYPE_VHD_VOLUME); } static inline int target_vhd(uint8_t type) { return (type == VHD_TYPE_VHD_FILE || type == VHD_TYPE_VHD_VOLUME); } struct target { char name[VHD_MAX_NAME_LEN]; char device[VHD_MAX_NAME_LEN]; uint64_t size; uint64_t start; uint64_t end; uint8_t type; }; struct iterator { int cur; int cur_size; int max_size; struct target *targets; }; struct vhd_image { char *name; char *parent; uint64_t capacity; off64_t size; uint8_t hidden; char marker; int error; char *message; struct target *target; struct list_head sibling; struct list_head children; struct vhd_image *parent_image; }; struct vhd_scan { int cur; int size; int lists_cur; int lists_size; struct vhd_image **images; struct vhd_image **lists; }; static int flags; static struct vg vg; static struct vhd_scan scan; static int vhd_util_scan_pretty_allocate_list(int cnt) { int i; memset(&scan, 0, sizeof(scan)); scan.lists_cur = 1; scan.lists_size = 10; scan.lists = calloc(scan.lists_size, sizeof(struct vhd_image *)); if (!scan.lists) goto fail; scan.lists[0] = calloc(cnt, sizeof(struct vhd_image)); if (!scan.lists[0]) goto fail; scan.images = calloc(cnt, sizeof(struct vhd_image *)); if (!scan.images) goto fail; for (i = 0; i < cnt; i++) scan.images[i] = scan.lists[0] + i; scan.cur = 0; scan.size = cnt; return 0; fail: if (scan.lists) { free(scan.lists[0]); free(scan.lists); } free(scan.images); memset(&scan, 0, sizeof(scan)); return -ENOMEM; } static void vhd_util_scan_pretty_free_list(void) { int i; if (scan.lists) { for (i = 0; i < scan.lists_cur; i++) free(scan.lists[i]); free(scan.lists); } free(scan.images); memset(&scan, 0, sizeof(scan)); } static int vhd_util_scan_pretty_add_image(struct vhd_image *image) { int i; struct vhd_image *img; for (i = 0; i < scan.cur; i++) { img = scan.images[i]; if (!strcmp(img->name, image->name)) 
return 0; } if (scan.cur >= scan.size) { struct vhd_image *new, **list; if (scan.lists_cur >= scan.lists_size) { list = realloc(scan.lists, scan.lists_size * 2 * sizeof(struct vhd_image *)); if (!list) return -ENOMEM; scan.lists_size *= 2; scan.lists = list; } new = calloc(scan.size, sizeof(struct vhd_image)); if (!new) return -ENOMEM; scan.lists[scan.lists_cur++] = new; scan.size *= 2; list = realloc(scan.images, scan.size * sizeof(struct vhd_image *)); if (!list) return -ENOMEM; scan.images = list; for (i = 0; i + scan.cur < scan.size; i++) scan.images[i + scan.cur] = new + i; } img = scan.images[scan.cur]; INIT_LIST_HEAD(&img->sibling); INIT_LIST_HEAD(&img->children); img->capacity = image->capacity; img->size = image->size; img->hidden = image->hidden; img->marker = image->marker; img->error = image->error; img->message = image->message; img->name = strdup(image->name); if (!img->name) goto fail; if (image->parent) { img->parent = strdup(image->parent); if (!img->parent) goto fail; } scan.cur++; return 0; fail: free(img->name); free(img->parent); memset(img, 0, sizeof(*img)); return -ENOMEM; } static int vhd_util_scan_pretty_image_compare(const void *lhs, const void *rhs) { struct vhd_image *l, *r; l = *(struct vhd_image **)lhs; r = *(struct vhd_image **)rhs; return strcmp(l->name, r->name); } static void vhd_util_scan_print_image_indent(struct vhd_image *image, int tab) { char *pad, *name, *pmsg, *parent; pad = (tab ? " " : ""); name = image->name; parent = (image->parent ? 
: "none"); if ((flags & VHD_SCAN_PRETTY) && image->parent && !image->parent_image) pmsg = " (not found in scan)"; else pmsg = ""; if (!(flags & VHD_SCAN_VERBOSE)) { name = basename(image->name); if (image->parent) parent = basename(image->parent); } if (image->error) printf("%*svhd=%s scan-error=%d error-message='%s'\n", tab, pad, image->name, image->error, image->message); else if (!(flags & VHD_SCAN_MARKERS)) printf("%*svhd=%s capacity=%"PRIu64" size=%"PRIu64" hidden=%u " "parent=%s%s\n", tab, pad, name, image->capacity, image->size, image->hidden, parent, pmsg); else printf("%*svhd=%s capacity=%"PRIu64" size=%"PRIu64" hidden=%u " "marker=%u parent=%s%s\n", tab, pad, name, image->capacity, image->size, image->hidden, (uint8_t)image->marker, parent, pmsg); } static void vhd_util_scan_pretty_print_tree(struct vhd_image *image, int depth) { struct vhd_image *img, *tmp; vhd_util_scan_print_image_indent(image, depth * 3); list_for_each_entry_safe(img, tmp, &image->children, sibling) if (!img->hidden) vhd_util_scan_pretty_print_tree(img, depth + 1); list_for_each_entry_safe(img, tmp, &image->children, sibling) if (img->hidden) vhd_util_scan_pretty_print_tree(img, depth + 1); free(image->name); free(image->parent); image->name = NULL; image->parent = NULL; } static void vhd_util_scan_pretty_print_images(void) { int i; struct vhd_image *image, **parentp, *parent, *keyp, key; qsort(scan.images, scan.cur, sizeof(scan.images[0]), vhd_util_scan_pretty_image_compare); for (i = 0; i < scan.cur; i++) { image = scan.images[i]; if (!image->parent) { image->parent_image = NULL; continue; } memset(&key, 0, sizeof(key)); key.name = image->parent; keyp = &key; parentp = bsearch(&keyp, scan.images, scan.cur, sizeof(scan.images[0]), vhd_util_scan_pretty_image_compare); if (!parentp) { image->parent_image = NULL; continue; } parent = *parentp; image->parent_image = parent; list_add_tail(&image->sibling, &parent->children); } for (i = 0; i < scan.cur; i++) { image = scan.images[i]; if 
(image->parent_image || !image->hidden) continue; vhd_util_scan_pretty_print_tree(image, 0); } for (i = 0; i < scan.cur; i++) { image = scan.images[i]; if (!image->name || image->parent_image) continue; vhd_util_scan_pretty_print_tree(image, 0); } for (i = 0; i < scan.cur; i++) { image = scan.images[i]; if (!image->name) continue; vhd_util_scan_pretty_print_tree(image, 0); } } static void vhd_util_scan_print_image(struct vhd_image *image) { int err; if (!image->error && (flags & VHD_SCAN_PRETTY)) { err = vhd_util_scan_pretty_add_image(image); if (!err) return; if (!image->error) { image->error = err; image->message = "allocating memory"; } } vhd_util_scan_print_image_indent(image, 0); } static int vhd_util_scan_error(const char *file, int err) { struct vhd_image image; memset(&image, 0, sizeof(image)); image.name = (char *)file; image.error = err; image.message = "failure scanning target"; vhd_util_scan_print_image(&image); /* if (flags & VHD_SCAN_NOFAIL) return 0; */ return err; } static vhd_parent_locator_t * vhd_util_scan_get_parent_locator(vhd_context_t *vhd) { int i; vhd_parent_locator_t *loc; loc = NULL; for (i = 0; i < 8; i++) { if (vhd->header.loc[i].code == PLAT_CODE_MACX) { loc = vhd->header.loc + i; break; } if (vhd->header.loc[i].code == PLAT_CODE_W2RU) loc = vhd->header.loc + i; if (!loc && vhd->header.loc[i].code != PLAT_CODE_NONE) loc = vhd->header.loc + i; } return loc; } static inline int copy_name(char *dst, const char *src) { if (snprintf(dst, VHD_MAX_NAME_LEN, "%s", src) < VHD_MAX_NAME_LEN) return 0; return -ENAMETOOLONG; } /* * LVHD stores realpath(parent) in parent locators, so * /dev// becomes /dev/mapper/- */ static int vhd_util_scan_extract_volume_name(char *dst, const char *src) { char copy[VHD_MAX_NAME_LEN], *name, *s, *c; name = strrchr(src, '/'); if (!name) name = (char *)src; /* convert single dashes to slashes, double dashes to single dashes */ for (c = copy, s = name; *s != '\0'; s++, c++) { if (*s == '-') { if (s[1] != '-') *c = 
'/'; else { s++; *c = '-'; } } else *c = *s; } *c = '\0'; c = strrchr(copy, '/'); if (c == name) { /* unrecognized format */ strcpy(dst, src); return -EINVAL; } strcpy(dst, ++c); return 0; } static int vhd_util_scan_get_volume_parent(vhd_context_t *vhd, struct vhd_image *image) { int err; char name[VHD_MAX_NAME_LEN]; vhd_parent_locator_t *loc, copy; if (flags & VHD_SCAN_FAST) { err = vhd_header_decode_parent(vhd, &vhd->header, &image->parent); if (!err) goto found; } loc = vhd_util_scan_get_parent_locator(vhd); if (!loc) return -EINVAL; copy = *loc; copy.data_offset += image->target->start; err = vhd_parent_locator_read(vhd, ©, &image->parent); if (err) return err; found: err = vhd_util_scan_extract_volume_name(name, image->parent); if (!err) return copy_name(image->parent, name); return 0; } static int vhd_util_scan_get_parent(vhd_context_t *vhd, struct vhd_image *image) { int err; vhd_parent_locator_t *loc; if (!target_vhd(image->target->type)) { image->parent = NULL; return 0; } loc = NULL; if (target_volume(image->target->type)) return vhd_util_scan_get_volume_parent(vhd, image); if (flags & VHD_SCAN_FAST) { err = vhd_header_decode_parent(vhd, &vhd->header, &image->parent); if (!err) return 0; } else { /* * vhd_parent_locator_get checks for the existence of the * parent file. if this call succeeds, all is well; if not, * we'll try to return whatever string we have before failing * outright. 
*/ err = vhd_parent_locator_get(vhd, &image->parent); if (!err) return 0; } loc = vhd_util_scan_get_parent_locator(vhd); if (!loc) return -EINVAL; return vhd_parent_locator_read(vhd, loc, &image->parent); } static int vhd_util_scan_get_hidden(vhd_context_t *vhd, struct vhd_image *image) { int err, hidden; err = 0; hidden = 0; if (target_vhd(image->target->type)) err = vhd_hidden(vhd, &hidden); else hidden = 1; if (err) return err; image->hidden = hidden; return 0; } static int vhd_util_scan_get_marker(vhd_context_t *vhd, struct vhd_image *image) { int err; char marker; err = 0; marker = 0; if (target_vhd(image->target->type) && vhd_has_batmap(vhd)) err = vhd_marker(vhd, &marker); image->marker = marker; return err; } static int vhd_util_scan_get_size(vhd_context_t *vhd, struct vhd_image *image) { image->size = image->target->size; if (target_vhd(image->target->type)) image->capacity = vhd->footer.curr_size; else image->capacity = image->size; return 0; } static int vhd_util_scan_open_file(vhd_context_t *vhd, struct vhd_image *image) { int err, vhd_flags; if (!target_vhd(image->target->type)) return 0; vhd_flags = VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED; if (flags & VHD_SCAN_FAST) vhd_flags |= VHD_OPEN_FAST; err = vhd_open(vhd, image->name, vhd_flags); if (err) { vhd->file = NULL; image->message = "opening file"; image->error = err; return image->error; } return 0; } static int vhd_util_scan_read_volume_headers(vhd_context_t *vhd, struct vhd_image *image) { int err; void *buf; size_t size; struct target *target; buf = NULL; target = image->target; size = sizeof(vhd_footer_t) + sizeof(vhd_header_t); err = posix_memalign(&buf, VHD_SECTOR_SIZE, size); if (err) { buf = NULL; image->message = "allocating image"; image->error = -err; goto out; } err = vhd_seek(vhd, target->start, SEEK_SET); if (err) { image->message = "seeking to headers"; image->error = err; goto out; } err = vhd_read(vhd, buf, size); if (err) { image->message = "reading headers"; image->error = err; 
goto out; } memcpy(&vhd->footer, buf, sizeof(vhd_footer_t)); vhd_footer_in(&vhd->footer); err = vhd_validate_footer(&vhd->footer); if (err) { image->message = "invalid footer"; image->error = err; goto out; } /* lvhd vhds should always be dynamic */ if (vhd_type_dynamic(vhd)) { if (vhd->footer.data_offset != sizeof(vhd_footer_t)) err = vhd_read_header_at(vhd, &vhd->header, vhd->footer.data_offset + target->start); else { memcpy(&vhd->header, buf + sizeof(vhd_footer_t), sizeof(vhd_header_t)); vhd_header_in(&vhd->header); err = vhd_validate_header(&vhd->header); } if (err) { image->message = "reading header"; image->error = err; goto out; } vhd->spb = vhd->header.block_size >> VHD_SECTOR_SHIFT; vhd->bm_secs = secs_round_up_no_zero(vhd->spb >> 3); } out: free(buf); return image->error; } static int vhd_util_scan_open_volume(vhd_context_t *vhd, struct vhd_image *image) { struct target *target; target = image->target; memset(vhd, 0, sizeof(*vhd)); vhd->oflags = VHD_OPEN_RDONLY | VHD_OPEN_FAST; if (target->end - target->start < 4096) { image->message = "device too small"; image->error = -EINVAL; return image->error; } vhd->file = strdup(image->name); if (!vhd->file) { image->message = "allocating device"; image->error = -ENOMEM; return image->error; } vhd->fd = open(target->device, O_RDONLY | O_DIRECT | O_LARGEFILE); if (vhd->fd == -1) { free(vhd->file); vhd->file = NULL; image->message = "opening device"; image->error = -errno; return image->error; } if (target_vhd(target->type)) return vhd_util_scan_read_volume_headers(vhd, image); return 0; } static int vhd_util_scan_open(vhd_context_t *vhd, struct vhd_image *image) { struct target *target; target = image->target; if (target_volume(image->target->type) || !(flags & VHD_SCAN_PRETTY)) image->name = target->name; else { char __image_name[PATH_MAX]; image->name = realpath(target->name, __image_name); if (image->name) image->name = strdup(__image_name); if (!image->name) { image->name = target->name; image->message = 
"resolving name"; image->error = -errno; return image->error; } } if (target_volume(target->type)) return vhd_util_scan_open_volume(vhd, image); else return vhd_util_scan_open_file(vhd, image); } static int vhd_util_scan_init_file_target(struct target *target, const char *file, uint8_t type) { int err; struct stat stats; err = stat(file, &stats); if (err == -1) return -errno; err = copy_name(target->name, file); if (err) return err; err = copy_name(target->device, file); if (err) return err; target->type = type; target->start = 0; target->size = stats.st_size; target->end = stats.st_size; return 0; } static int vhd_util_scan_init_volume_target(struct target *target, struct lv *lv, uint8_t type) { int err; if (lv->first_segment.type != LVM_SEG_TYPE_LINEAR) return -ENOSYS; err = copy_name(target->name, lv->name); if (err) { EPRINTF("copy target name failed: '%s'\n", lv->name); return err; } err = copy_name(target->device, lv->first_segment.device); if (err) { EPRINTF("copy target device failed: '%s'\n", lv->first_segment.device); return err; } target->type = type; target->size = lv->size; target->start = lv->first_segment.pe_start; target->end = target->start + lv->first_segment.pe_size; return 0; } static int iterator_init(struct iterator *itr, int cnt, struct target *targets) { memset(itr, 0, sizeof(*itr)); itr->targets = malloc(sizeof(struct target) * cnt); if (!itr->targets) return -ENOMEM; memcpy(itr->targets, targets, sizeof(struct target) * cnt); itr->cur = 0; itr->cur_size = cnt; itr->max_size = cnt; return 0; } static struct target * iterator_next(struct iterator *itr) { if (itr->cur == itr->cur_size) return NULL; return itr->targets + itr->cur++; } static int iterator_add_file(struct iterator *itr, struct target *target, const char *parent, uint8_t type) { int i; struct target *t; char *lname, *rname; for (i = 0; i < itr->cur_size; i++) { t = itr->targets + i; lname = basename((char *)t->name); rname = basename((char *)parent); if (!strcmp(lname, rname)) 
return -EEXIST; } return vhd_util_scan_init_file_target(target, parent, type); } static int iterator_add_volume(struct iterator *itr, struct target *target, const char *parent, uint8_t type) { int i, err; struct lv *lv; lv = NULL; err = -ENOENT; for (i = 0; i < itr->cur_size; i++) if (!strcmp(parent, itr->targets[i].name)) return -EEXIST; for (i = 0; i < vg.lv_cnt; i++) { err = fnmatch(parent, vg.lvs[i].name, FNM_PATHNAME); if (err != FNM_NOMATCH) { lv = vg.lvs + i; break; } } if (err && err != FNM_PATHNAME) return err; if (!lv) return -ENOENT; return vhd_util_scan_init_volume_target(target, lv, type); } static int iterator_add(struct iterator *itr, const char *parent, uint8_t type) { int err; struct target *target; if (itr->cur_size == itr->max_size) { struct target *new; new = realloc(itr->targets, sizeof(struct target) * itr->max_size * 2); if (!new) return -ENOMEM; itr->max_size *= 2; itr->targets = new; } target = itr->targets + itr->cur_size; if (target_volume(type)) err = iterator_add_volume(itr, target, parent, type); else err = iterator_add_file(itr, target, parent, type); if (err) memset(target, 0, sizeof(*target)); else itr->cur_size++; return (err == -EEXIST ? 0 : err); } static void iterator_free(struct iterator *itr) { free(itr->targets); memset(itr, 0, sizeof(*itr)); } static void vhd_util_scan_add_parent(struct iterator *itr, vhd_context_t *vhd, struct vhd_image *image) { int err; uint8_t type; if (vhd_parent_raw(vhd)) type = target_volume(image->target->type) ? VHD_TYPE_RAW_VOLUME : VHD_TYPE_RAW_FILE; else type = target_volume(image->target->type) ? 
VHD_TYPE_VHD_VOLUME : VHD_TYPE_VHD_FILE; err = iterator_add(itr, image->parent, type); if (err) vhd_util_scan_error(image->parent, err); } static int vhd_util_scan_targets(int cnt, struct target *targets) { int ret, err; vhd_context_t vhd; struct iterator itr; struct target *target; struct vhd_image image; ret = 0; err = 0; err = iterator_init(&itr, cnt, targets); if (err) return err; while ((target = iterator_next(&itr))) { memset(&vhd, 0, sizeof(vhd)); memset(&image, 0, sizeof(image)); image.target = target; err = vhd_util_scan_open(&vhd, &image); if (err) { ret = -EAGAIN; goto end; } err = vhd_util_scan_get_size(&vhd, &image); if (err) { ret = -EAGAIN; image.message = "getting physical size"; image.error = err; goto end; } err = vhd_util_scan_get_hidden(&vhd, &image); if (err) { ret = -EAGAIN; image.message = "checking 'hidden' field"; image.error = err; goto end; } if (flags & VHD_SCAN_MARKERS) { err = vhd_util_scan_get_marker(&vhd, &image); if (err) { ret = -EAGAIN; image.message = "checking marker"; image.error = err; goto end; } } if (vhd.footer.type == HD_TYPE_DIFF) { err = vhd_util_scan_get_parent(&vhd, &image); if (err) { ret = -EAGAIN; image.message = "getting parent"; image.error = err; goto end; } } end: vhd_util_scan_print_image(&image); if (flags & VHD_SCAN_PARENTS && image.parent) vhd_util_scan_add_parent(&itr, &vhd, &image); if (vhd.file) vhd_close(&vhd); if (image.name != target->name) free(image.name); free(image.parent); if (err && !(flags & VHD_SCAN_NOFAIL)) break; } iterator_free(&itr); if (flags & VHD_SCAN_NOFAIL) return ret; return err; } static int vhd_util_scan_targets_pretty(int cnt, struct target *targets) { int err; err = vhd_util_scan_pretty_allocate_list(cnt); if (err) { printf("scan failed: no memory\n"); return -ENOMEM; } err = vhd_util_scan_targets(cnt, targets); vhd_util_scan_pretty_print_images(); vhd_util_scan_pretty_free_list(); return ((flags & VHD_SCAN_NOFAIL) ? 
0 : err); } static int vhd_util_scan_find_file_targets(int cnt, char **names, const char *filter, struct target **_targets, int *_total) { glob_t g; struct target *targets; int i, globs, err, total; total = cnt; globs = 0; *_total = 0; *_targets = NULL; memset(&g, 0, sizeof(g)); if (filter) { int gflags = ((flags & VHD_SCAN_FAST) ? GLOB_NOSORT : 0); errno = 0; err = glob(filter, gflags, vhd_util_scan_error, &g); switch (err) { case GLOB_NOSPACE: err = -ENOMEM; break; case GLOB_ABORTED: err = -EIO; break; case GLOB_NOMATCH: err = -errno; break; } if (err) { vhd_util_scan_error(filter, err); return err; } globs = g.gl_pathc; total += globs; } targets = calloc(total, sizeof(struct target)); if (!targets) { err = -ENOMEM; goto out; } for (i = 0; i < g.gl_pathc; i++) { err = vhd_util_scan_init_file_target(targets + i, g.gl_pathv[i], VHD_TYPE_VHD_FILE); if (err) { vhd_util_scan_error(g.gl_pathv[i], err); if (!(flags & VHD_SCAN_NOFAIL)) goto out; } } for (i = 0; i + globs < total; i++) { err = vhd_util_scan_init_file_target(targets + i + globs, names[i], VHD_TYPE_VHD_FILE); if (err) { vhd_util_scan_error(names[i], err); if (!(flags & VHD_SCAN_NOFAIL)) goto out; } } err = 0; *_total = total; *_targets = targets; out: if (err) free(targets); if (filter) globfree(&g); return err; } static inline void swap_volume(struct lv *lvs, int dst, int src) { struct lv copy, *ldst, *lsrc; if (dst == src) return; lsrc = lvs + src; ldst = lvs + dst; memcpy(©, ldst, sizeof(copy)); memcpy(ldst, lsrc, sizeof(*ldst)); memcpy(lsrc, ©, sizeof(copy)); } static int vhd_util_scan_sort_volumes(struct lv *lvs, int cnt, const char *filter, int *_matches) { struct lv *lv; int i, err, matches; matches = 0; *_matches = 0; if (!filter) return 0; for (i = 0; i < cnt; i++) { lv = lvs + i; err = fnmatch(filter, lv->name, FNM_PATHNAME); if (err) { if (err != FNM_NOMATCH) { EPRINTF("fnmatch failed: '%s', '%s'\n", filter, lv->name); vhd_util_scan_error(lv->name, err); if (!(flags & VHD_SCAN_NOFAIL)) return 
err; } continue; } swap_volume(lvs, matches++, i); } *_matches = matches; return 0; } static int vhd_util_scan_find_volume_targets(int cnt, char **names, const char *volume, const char *filter, struct target **_targets, int *_total) { struct target *targets; int i, err, total, matches; *_total = 0; *_targets = NULL; targets = NULL; err = lvm_scan_vg(volume, &vg); if (err) return err; err = vhd_util_scan_sort_volumes(vg.lvs, vg.lv_cnt, filter, &matches); if (err) goto out; total = matches; for (i = 0; i < cnt; i++) { err = vhd_util_scan_sort_volumes(vg.lvs + total, vg.lv_cnt - total, names[i], &matches); if (err) goto out; total += matches; } targets = calloc(total, sizeof(struct target)); if (!targets) { err = -ENOMEM; goto out; } for (i = 0; i < total; i++) { err = vhd_util_scan_init_volume_target(targets + i, vg.lvs + i, VHD_TYPE_VHD_VOLUME); if (err) { vhd_util_scan_error(vg.lvs[i].name, err); if (!(flags & VHD_SCAN_NOFAIL)) goto out; } } err = 0; *_total = total; *_targets = targets; out: if (err) free(targets); return err; } static int vhd_util_scan_find_targets(int cnt, char **names, const char *volume, const char *filter, struct target **targets, int *total) { if (flags & VHD_SCAN_VOLUME) return vhd_util_scan_find_volume_targets(cnt, names, volume, filter, targets, total); return vhd_util_scan_find_file_targets(cnt, names, filter, targets, total); } int vhd_util_scan(int argc, char **argv) { int c, err, cnt; char *filter, *volume; struct target *targets; cnt = 0; err = 0; flags = 0; filter = NULL; volume = NULL; targets = NULL; optind = 0; while ((c = getopt(argc, argv, "m:fcl:pavMh")) != -1) { switch (c) { case 'm': filter = optarg; break; case 'f': flags |= VHD_SCAN_FAST; break; case 'c': flags |= VHD_SCAN_NOFAIL; break; case 'l': volume = optarg; flags |= VHD_SCAN_VOLUME; break; case 'p': flags |= VHD_SCAN_PRETTY; break; case 'a': flags |= VHD_SCAN_PARENTS; break; case 'v': flags |= VHD_SCAN_VERBOSE; break; case 'M': flags |= VHD_SCAN_MARKERS; break; case 
'h': goto usage; default: err = -EINVAL; goto usage; } } if (!filter && argc - optind == 0) { err = -EINVAL; goto usage; } if (flags & VHD_SCAN_PRETTY) flags &= ~VHD_SCAN_FAST; err = vhd_util_scan_find_targets(argc - optind, argv + optind, volume, filter, &targets, &cnt); if (err) { printf("scan failed: %d\n", err); return err; } if (!cnt) return 0; if (flags & VHD_SCAN_PRETTY) err = vhd_util_scan_targets_pretty(cnt, targets); else err = vhd_util_scan_targets(cnt, targets); free(targets); lvm_free_vg(&vg); return ((flags & VHD_SCAN_NOFAIL) ? 0 : err); usage: printf("usage: [OPTIONS] FILES\n" "options: [-m match filter] [-f fast] [-c continue on failure] " "[-l LVM volume] [-p pretty print] [-a scan parents] " "[-v verbose] [-h help] [-M show markers]\n"); return err; } blktap-2.0.90/vhd/lib/libvhd.c0000644000000000000000000024315711664745551014552 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #include #include #include #include #include #include #include #include #include #include #include #include #include "libvhd.h" #include "relative-path.h" #define VHD_HEADER_MAX_RETRIES 10 static int libvhd_dbg = 0; void libvhd_set_log_level(int level) { if (level) libvhd_dbg = 1; } #define VHDLOG(_f, _a...) \ do { \ if (libvhd_dbg) \ syslog(LOG_INFO, "libvhd::%s: "_f, \ __func__, ##_a); \ } while (0) #define ASSERT(_p) \ if (!(_p)) { \ libvhd_set_log_level(1); \ VHDLOG("%s:%d: FAILED ASSERTION: '%s'\n", \ __FILE__, __LINE__, #_p); \ *(int*)0 = 0; \ } #ifdef ENABLE_FAILURE_TESTING const char* ENV_VAR_FAIL[NUM_FAIL_TESTS] = { "VHD_UTIL_TEST_FAIL_REPARENT_BEGIN", "VHD_UTIL_TEST_FAIL_REPARENT_LOCATOR", "VHD_UTIL_TEST_FAIL_REPARENT_END", "VHD_UTIL_TEST_FAIL_RESIZE_BEGIN", "VHD_UTIL_TEST_FAIL_RESIZE_DATA_MOVED", "VHD_UTIL_TEST_FAIL_RESIZE_METADATA_MOVED", "VHD_UTIL_TEST_FAIL_RESIZE_END" }; int TEST_FAIL[NUM_FAIL_TESTS]; #endif // ENABLE_FAILURE_TESTING static void vhd_cache_init(vhd_context_t *); static int vhd_cache_enabled(vhd_context_t *); static int vhd_cache_load(vhd_context_t *); static int vhd_cache_unload(vhd_context_t *); static vhd_context_t * vhd_cache_get_parent(vhd_context_t *); static inline int old_test_bit(volatile char *addr, int nr) { return (((uint32_t *)addr)[nr >> 5] >> (nr & 31)) & 1; } static inline void old_set_bit(volatile char 
*addr, int nr) { ((uint32_t *)addr)[nr >> 5] |= (1 << (nr & 31)); } static inline void old_clear_bit(volatile char *addr, int nr) { ((uint32_t *)addr)[nr >> 5] &= ~(1 << (nr & 31)); } void vhd_footer_in(vhd_footer_t *footer) { BE32_IN(&footer->features); BE32_IN(&footer->ff_version); BE64_IN(&footer->data_offset); BE32_IN(&footer->timestamp); BE32_IN(&footer->crtr_ver); BE32_IN(&footer->crtr_os); BE64_IN(&footer->orig_size); BE64_IN(&footer->curr_size); BE32_IN(&footer->geometry); BE32_IN(&footer->type); BE32_IN(&footer->checksum); } void vhd_footer_out(vhd_footer_t *footer) { BE32_OUT(&footer->features); BE32_OUT(&footer->ff_version); BE64_OUT(&footer->data_offset); BE32_OUT(&footer->timestamp); BE32_OUT(&footer->crtr_ver); BE32_OUT(&footer->crtr_os); BE64_OUT(&footer->orig_size); BE64_OUT(&footer->curr_size); BE32_OUT(&footer->geometry); BE32_OUT(&footer->type); BE32_OUT(&footer->checksum); } void vhd_header_in(vhd_header_t *header) { int i, n; BE64_IN(&header->data_offset); BE64_IN(&header->table_offset); BE32_IN(&header->hdr_ver); BE32_IN(&header->max_bat_size); BE32_IN(&header->block_size); BE32_IN(&header->checksum); BE32_IN(&header->prt_ts); n = sizeof(header->loc) / sizeof(vhd_parent_locator_t); for (i = 0; i < n; i++) { BE32_IN(&header->loc[i].code); BE32_IN(&header->loc[i].data_space); BE32_IN(&header->loc[i].data_len); BE64_IN(&header->loc[i].data_offset); } } void vhd_header_out(vhd_header_t *header) { int i, n; BE64_OUT(&header->data_offset); BE64_OUT(&header->table_offset); BE32_OUT(&header->hdr_ver); BE32_OUT(&header->max_bat_size); BE32_OUT(&header->block_size); BE32_OUT(&header->checksum); BE32_OUT(&header->prt_ts); n = sizeof(header->loc) / sizeof(vhd_parent_locator_t); for (i = 0; i < n; i++) { BE32_OUT(&header->loc[i].code); BE32_OUT(&header->loc[i].data_space); BE32_OUT(&header->loc[i].data_len); BE64_OUT(&header->loc[i].data_offset); } } void vhd_batmap_header_in(vhd_batmap_t *batmap) { BE64_IN(&batmap->header.batmap_offset); 
BE32_IN(&batmap->header.batmap_size); BE32_IN(&batmap->header.batmap_version); BE32_IN(&batmap->header.checksum); } void vhd_batmap_header_out(vhd_batmap_t *batmap) { BE64_OUT(&batmap->header.batmap_offset); BE32_OUT(&batmap->header.batmap_size); BE32_OUT(&batmap->header.batmap_version); BE32_OUT(&batmap->header.checksum); } void vhd_bat_in(vhd_bat_t *bat) { int i; for (i = 0; i < bat->entries; i++) BE32_IN(&bat->bat[i]); } void vhd_bat_out(vhd_bat_t *bat) { int i; for (i = 0; i < bat->entries; i++) BE32_OUT(&bat->bat[i]); } uint32_t vhd_checksum_footer(vhd_footer_t *footer) { int i; unsigned char *blob; uint32_t checksum, tmp; checksum = 0; tmp = footer->checksum; footer->checksum = 0; blob = (unsigned char *)footer; for (i = 0; i < sizeof(vhd_footer_t); i++) checksum += (uint32_t)blob[i]; footer->checksum = tmp; return ~checksum; } int vhd_validate_footer(vhd_footer_t *footer) { int csize; uint32_t checksum; csize = sizeof(footer->cookie); if (memcmp(footer->cookie, HD_COOKIE, csize) != 0 && memcmp(footer->cookie, VHD_POISON_COOKIE, csize) != 0) { char buf[9]; memcpy(buf, footer->cookie, 8); buf[8]= '\0'; VHDLOG("invalid footer cookie: %s\n", buf); return -EINVAL; } checksum = vhd_checksum_footer(footer); if (checksum != footer->checksum) { /* * early td-util did not re-calculate * checksum when marking vhds 'hidden' */ if (footer->hidden && !strncmp(footer->crtr_app, "tap", 3) && (footer->crtr_ver == VHD_VERSION(0, 1) || footer->crtr_ver == VHD_VERSION(1, 1))) { char tmp = footer->hidden; footer->hidden = 0; checksum = vhd_checksum_footer(footer); footer->hidden = tmp; if (checksum == footer->checksum) return 0; } VHDLOG("invalid footer checksum: " "footer = 0x%08x, calculated = 0x%08x\n", footer->checksum, checksum); return -EINVAL; } return 0; } uint32_t vhd_checksum_header(vhd_header_t *header) { int i; unsigned char *blob; uint32_t checksum, tmp; checksum = 0; tmp = header->checksum; header->checksum = 0; blob = (unsigned char *)header; for (i = 0; i < 
sizeof(vhd_header_t); i++) checksum += (uint32_t)blob[i]; header->checksum = tmp; return ~checksum; } int vhd_validate_header(vhd_header_t *header) { int i, n; uint32_t checksum; if (memcmp(header->cookie, DD_COOKIE, 8) != 0) { char buf[9]; memcpy(buf, header->cookie, 8); buf[8] = '\0'; VHDLOG("invalid header cookie: %s\n", buf); return -EINVAL; } if (header->hdr_ver != 0x00010000) { VHDLOG("invalid header version 0x%08x\n", header->hdr_ver); return -EINVAL; } if (header->data_offset != 0xFFFFFFFFFFFFFFFFULL) { VHDLOG("invalid header data_offset 0x%016"PRIx64"\n", header->data_offset); return -EINVAL; } n = sizeof(header->loc) / sizeof(vhd_parent_locator_t); for (i = 0; i < n; i++) if (vhd_validate_platform_code(header->loc[i].code)) return -EINVAL; checksum = vhd_checksum_header(header); if (checksum != header->checksum) { VHDLOG("invalid header checksum: " "header = 0x%08x, calculated = 0x%08x\n", header->checksum, checksum); return -EINVAL; } return 0; } static inline int vhd_validate_bat(vhd_bat_t *bat) { if (!bat->bat) return -EINVAL; return 0; } uint32_t vhd_checksum_batmap(vhd_context_t *ctx, vhd_batmap_t *batmap) { int i; char *blob; uint32_t checksum; size_t map_size; blob = batmap->map; checksum = 0; map_size = vhd_sectors_to_bytes(secs_round_up_no_zero( ctx->footer.curr_size >> (VHD_BLOCK_SHIFT + 3))); for (i = 0; i < map_size; i++) { if (batmap->header.batmap_version == VHD_BATMAP_VERSION(1, 1)) checksum += (uint32_t)blob[i]; else checksum += (uint32_t)(unsigned char)blob[i]; } return ~checksum; } int vhd_validate_batmap_header(vhd_batmap_t *batmap) { if (memcmp(batmap->header.cookie, VHD_BATMAP_COOKIE, 8)) return -EINVAL; if (batmap->header.batmap_version > VHD_BATMAP_CURRENT_VERSION) return -EINVAL; return 0; } int vhd_validate_batmap(vhd_context_t *ctx, vhd_batmap_t *batmap) { uint32_t checksum; if (!batmap->map) return -EINVAL; checksum = vhd_checksum_batmap(ctx, batmap); if (checksum != batmap->header.checksum) return -EINVAL; return 0; } int 
vhd_batmap_header_offset(vhd_context_t *ctx, off64_t *_off) { off64_t off; size_t bat; *_off = 0; off = ctx->header.table_offset; bat = ctx->header.max_bat_size * sizeof(uint32_t); off += vhd_bytes_padded(bat); *_off = off; return 0; } int vhd_validate_platform_code(uint32_t code) { switch (code) { case PLAT_CODE_NONE: case PLAT_CODE_WI2R: case PLAT_CODE_WI2K: case PLAT_CODE_W2RU: case PLAT_CODE_W2KU: case PLAT_CODE_MAC: case PLAT_CODE_MACX: return 0; default: VHDLOG("invalid parent locator code %u\n", code); return -EINVAL; } } int vhd_parent_locator_count(vhd_context_t *ctx) { return (sizeof(ctx->header.loc) / sizeof(vhd_parent_locator_t)); } int vhd_hidden(vhd_context_t *ctx, int *hidden) { int err; *hidden = 0; if (vhd_type_dynamic(ctx) && vhd_creator_tapdisk(ctx) && (ctx->footer.crtr_ver == VHD_VERSION(0, 1) || ctx->footer.crtr_ver == VHD_VERSION(1, 1))) { vhd_footer_t copy; err = vhd_read_footer_at(ctx, ©, 0); if (err) { VHDLOG("error reading backup footer of %s: %d\n", ctx->file, err); return err; } *hidden = copy.hidden; } else *hidden = ctx->footer.hidden; return 0; } int vhd_chain_depth(vhd_context_t *ctx, int *depth) { char *file; int err, cnt; vhd_context_t vhd, *cur; err = 0; cnt = 0; *depth = 0; file = NULL; cur = ctx; for (;;) { cnt++; if (cur->footer.type != HD_TYPE_DIFF) break; if (vhd_parent_raw(cur)) { cnt++; break; } err = vhd_parent_locator_get(cur, &file); if (err) { file = NULL; break; } if (cur != ctx) { vhd_close(cur); cur = NULL; } err = vhd_open(&vhd, file, VHD_OPEN_RDONLY); if (err) break; cur = &vhd; free(file); file = NULL; } free(file); if (cur && cur != ctx) vhd_close(cur); if (!err) *depth = cnt; return err; } int vhd_batmap_test(vhd_context_t *ctx, vhd_batmap_t *batmap, uint32_t block) { if (!vhd_has_batmap(ctx) || !batmap->map) return 0; if (block >= (batmap->header.batmap_size << (VHD_SECTOR_SHIFT + 3))) return 0; return test_bit(batmap->map, block); } void vhd_batmap_set(vhd_context_t *ctx, vhd_batmap_t *batmap, uint32_t block) 
{ if (!vhd_has_batmap(ctx) || !batmap->map) return; if (block >= (batmap->header.batmap_size << (VHD_SECTOR_SHIFT + 3))) return; set_bit(batmap->map, block); } void vhd_batmap_clear(vhd_context_t *ctx, vhd_batmap_t *batmap, uint32_t block) { if (!vhd_has_batmap(ctx) || !batmap->map) return; if (block >= (batmap->header.batmap_size << (VHD_SECTOR_SHIFT + 3))) return; clear_bit(batmap->map, block); } int vhd_bitmap_test(vhd_context_t *ctx, char *map, uint32_t block) { if (vhd_creator_tapdisk(ctx) && ctx->footer.crtr_ver == 0x00000001) return old_test_bit(map, block); return test_bit(map, block); } void vhd_bitmap_set(vhd_context_t *ctx, char *map, uint32_t block) { if (vhd_creator_tapdisk(ctx) && ctx->footer.crtr_ver == 0x00000001) return old_set_bit(map, block); return set_bit(map, block); } void vhd_bitmap_clear(vhd_context_t *ctx, char *map, uint32_t block) { if (vhd_creator_tapdisk(ctx) && ctx->footer.crtr_ver == 0x00000001) return old_clear_bit(map, block); return clear_bit(map, block); } /* * returns absolute offset of the first * byte of the file which is not vhd metadata */ int vhd_end_of_headers(vhd_context_t *ctx, off64_t *end) { int err, i, n; uint32_t bat_bytes; off64_t eom, bat_end; vhd_parent_locator_t *loc; *end = 0; if (!vhd_type_dynamic(ctx)) return 0; eom = ctx->footer.data_offset + sizeof(vhd_header_t); bat_bytes = vhd_bytes_padded(ctx->header.max_bat_size * sizeof(uint32_t)); bat_end = ctx->header.table_offset + bat_bytes; eom = MAX(eom, bat_end); if (vhd_has_batmap(ctx)) { off64_t hdr_end, hdr_secs, map_end, map_secs; err = vhd_get_batmap(ctx); if (err) return err; hdr_secs = secs_round_up_no_zero(sizeof(vhd_batmap_header_t)); err = vhd_batmap_header_offset(ctx, &hdr_end); if (err) return err; hdr_end += vhd_sectors_to_bytes(hdr_secs); eom = MAX(eom, hdr_end); map_secs = ctx->batmap.header.batmap_size; map_end = (ctx->batmap.header.batmap_offset + vhd_sectors_to_bytes(map_secs)); eom = MAX(eom, map_end); } /* parent locators */ n = 
sizeof(ctx->header.loc) / sizeof(vhd_parent_locator_t); for (i = 0; i < n; i++) { off64_t loc_end; loc = &ctx->header.loc[i]; if (loc->code == PLAT_CODE_NONE) continue; loc_end = loc->data_offset + vhd_parent_locator_size(loc); eom = MAX(eom, loc_end); } *end = eom; return 0; } int vhd_end_of_data(vhd_context_t *ctx, off64_t *end) { int i, err; off64_t max; uint64_t blk; if (!vhd_type_dynamic(ctx)) { err = vhd_seek(ctx, 0, SEEK_END); if (err) return err; max = vhd_position(ctx); if (max == (off64_t)-1) return -errno; *end = max - sizeof(vhd_footer_t); return 0; } err = vhd_end_of_headers(ctx, &max); if (err) return err; err = vhd_get_bat(ctx); if (err) return err; max >>= VHD_SECTOR_SHIFT; for (i = 0; i < ctx->bat.entries; i++) { blk = ctx->bat.bat[i]; if (blk != DD_BLK_UNUSED) { blk += ctx->spb + ctx->bm_secs; max = MAX(blk, max); } } *end = vhd_sectors_to_bytes(max); return 0; } uint32_t vhd_time(time_t time) { struct tm tm; time_t micro_epoch; memset(&tm, 0, sizeof(struct tm)); tm.tm_year = 100; tm.tm_mon = 0; tm.tm_mday = 1; micro_epoch = mktime(&tm); return (uint32_t)(time - micro_epoch); } /* * Stringify the VHD timestamp for printing. * As with ctime_r, target must be >=26 bytes. */ size_t vhd_time_to_string(uint32_t timestamp, char *target) { char *cr; struct tm tm; time_t t1, t2; memset(&tm, 0, sizeof(struct tm)); /* VHD uses an epoch of 12:00AM, Jan 1, 2000. */ /* Need to adjust this to the expected epoch of 1970. */ tm.tm_year = 100; tm.tm_mon = 0; tm.tm_mday = 1; t1 = mktime(&tm); t2 = t1 + (time_t)timestamp; ctime_r(&t2, target); /* handle mad ctime_r newline appending. */ if ((cr = strchr(target, '\n')) != NULL) *cr = '\0'; return (strlen(target)); } /* * nabbed from vhd specs. 
*/ uint32_t vhd_chs(uint64_t size) { uint32_t secs, cylinders, heads, spt, cth; secs = secs_round_up_no_zero(size); if (secs > 65535 * 16 * 255) secs = 65535 * 16 * 255; if (secs >= 65535 * 16 * 63) { spt = 255; cth = secs / spt; heads = 16; } else { spt = 17; cth = secs / spt; heads = (cth + 1023) / 1024; if (heads < 4) heads = 4; if (cth >= (heads * 1024) || heads > 16) { spt = 31; cth = secs / spt; heads = 16; } if (cth >= heads * 1024) { spt = 63; cth = secs / spt; heads = 16; } } cylinders = cth / heads; return GEOM_ENCODE(cylinders, heads, spt); } int vhd_get_footer(vhd_context_t *ctx) { if (!vhd_validate_footer(&ctx->footer)) return 0; return vhd_read_footer(ctx, &ctx->footer); } int vhd_get_header(vhd_context_t *ctx) { if (!vhd_type_dynamic(ctx)) return -EINVAL; if (!vhd_validate_header(&ctx->header)) return 0; return vhd_read_header(ctx, &ctx->header); } int vhd_get_bat(vhd_context_t *ctx) { if (!vhd_type_dynamic(ctx)) return -EINVAL; if (!vhd_validate_bat(&ctx->bat)) return 0; vhd_put_bat(ctx); return vhd_read_bat(ctx, &ctx->bat); } int vhd_get_batmap(vhd_context_t *ctx) { if (!vhd_has_batmap(ctx)) return -EINVAL; if (!vhd_validate_batmap(ctx, &ctx->batmap)) return 0; vhd_put_batmap(ctx); return vhd_read_batmap(ctx, &ctx->batmap); } void vhd_put_footer(vhd_context_t *ctx) { memset(&ctx->footer, 0, sizeof(vhd_footer_t)); } void vhd_put_header(vhd_context_t *ctx) { memset(&ctx->header, 0, sizeof(vhd_header_t)); } void vhd_put_bat(vhd_context_t *ctx) { if (!vhd_type_dynamic(ctx)) return; free(ctx->bat.bat); memset(&ctx->bat, 0, sizeof(vhd_bat_t)); } void vhd_put_batmap(vhd_context_t *ctx) { if (!vhd_type_dynamic(ctx)) return; if (!vhd_has_batmap(ctx)) return; free(ctx->batmap.map); memset(&ctx->batmap, 0, sizeof(vhd_batmap_t)); } /* * look for 511 byte footer at end of file */ int vhd_read_short_footer(vhd_context_t *ctx, vhd_footer_t *footer) { off64_t eof; void *buf; int err; buf = NULL; err = vhd_seek(ctx, 0, SEEK_END); if (err) goto out; eof = 
vhd_position(ctx); if (eof == (off64_t)-1) { err = -errno; goto out; } err = vhd_seek(ctx, eof - 511, SEEK_SET); if (err) goto out; err = posix_memalign(&buf, VHD_SECTOR_SIZE, sizeof(vhd_footer_t)); if (err) { buf = NULL; err = -err; goto out; } memset(buf, 0, sizeof(vhd_footer_t)); /* * expecting short read here */ vhd_read(ctx, buf, sizeof(vhd_footer_t)); memcpy(footer, buf, sizeof(vhd_footer_t)); vhd_footer_in(footer); err = vhd_validate_footer(footer); out: if (err) VHDLOG("%s: failed reading short footer: %d\n", ctx->file, err); free(buf); return err; } int vhd_read_footer_at(vhd_context_t *ctx, vhd_footer_t *footer, off64_t off) { void *buf; int err; buf = NULL; err = vhd_seek(ctx, off, SEEK_SET); if (err) goto out; err = posix_memalign(&buf, VHD_SECTOR_SIZE, sizeof(vhd_footer_t)); if (err) { buf = NULL; err = -err; goto out; } err = vhd_read(ctx, buf, sizeof(vhd_footer_t)); if (err) goto out; memcpy(footer, buf, sizeof(vhd_footer_t)); vhd_footer_in(footer); err = vhd_validate_footer(footer); out: if (err) VHDLOG("%s: reading footer at 0x%08"PRIx64" failed: %d\n", ctx->file, off, err); free(buf); return err; } int vhd_read_footer(vhd_context_t *ctx, vhd_footer_t *footer) { int err; off64_t off; err = vhd_seek(ctx, 0, SEEK_END); if (err) return err; off = vhd_position(ctx); if (off == (off64_t)-1) return -errno; err = vhd_read_footer_at(ctx, footer, off - 512); if (err != -EINVAL) return err; err = vhd_read_short_footer(ctx, footer); if (err != -EINVAL) return err; /* * Disable the enforcement of VHD_OPEN_STRICT until we figure out how * to recover from crashes. Note that we never enforced it before * anyways due to a bug (CA-28285) and everything was ok. 
*/ /* if (ctx->oflags & VHD_OPEN_STRICT) return -EINVAL; */ return vhd_read_footer_at(ctx, footer, 0); } int vhd_read_header_at(vhd_context_t *ctx, vhd_header_t *header, off64_t off) { void *buf; int err; buf = NULL; if (!vhd_type_dynamic(ctx)) { err = -EINVAL; goto out; } err = vhd_seek(ctx, off, SEEK_SET); if (err) goto out; err = posix_memalign(&buf, VHD_SECTOR_SIZE, sizeof(vhd_header_t)); if (err) { buf = NULL; err = -err; goto out; } err = vhd_read(ctx, buf, sizeof(vhd_header_t)); if (err) goto out; memcpy(header, buf, sizeof(vhd_header_t)); vhd_header_in(header); err = vhd_validate_header(header); out: if (err) VHDLOG("%s: reading header at 0x%08"PRIx64" failed: %d\n", ctx->file, off, err); free(buf); return err; } int vhd_read_header(vhd_context_t *ctx, vhd_header_t *header) { off64_t off; if (!vhd_type_dynamic(ctx)) { VHDLOG("%s is not dynamic!\n", ctx->file); return -EINVAL; } off = ctx->footer.data_offset; return vhd_read_header_at(ctx, header, off); } int vhd_read_bat(vhd_context_t *ctx, vhd_bat_t *bat) { int err; void *buf; off64_t off; uint32_t vhd_blks; size_t size; buf = NULL; if (!vhd_type_dynamic(ctx)) { err = -EINVAL; goto fail; } off = ctx->header.table_offset; /* The BAT size is stored in ctx->header.max_bat_size. 
However, we * sometimes preallocate BAT + batmap for max VHD size, so only read in * the BAT entries that are in use for curr_size */ vhd_blks = ctx->footer.curr_size >> VHD_BLOCK_SHIFT; ASSERT(ctx->header.max_bat_size >= vhd_blks); size = vhd_bytes_padded(vhd_blks * sizeof(uint32_t)); err = posix_memalign(&buf, VHD_SECTOR_SIZE, size); if (err) { buf = NULL; err = -err; goto fail; } err = vhd_seek(ctx, off, SEEK_SET); if (err) goto fail; err = vhd_read(ctx, buf, size); if (err) goto fail; bat->spb = ctx->header.block_size >> VHD_SECTOR_SHIFT; bat->entries = vhd_blks; bat->bat = (uint32_t *)buf; vhd_bat_in(bat); return 0; fail: free(buf); memset(bat, 0, sizeof(vhd_bat_t)); VHDLOG("%s: failed to read bat: %d\n", ctx->file, err); return err; } static int vhd_read_batmap_header(vhd_context_t *ctx, vhd_batmap_t *batmap) { int err; void *buf; off64_t off; size_t size; buf = NULL; err = vhd_batmap_header_offset(ctx, &off); if (err) goto fail; err = vhd_seek(ctx, off, SEEK_SET); if (err) goto fail; size = vhd_bytes_padded(sizeof(vhd_batmap_header_t)); err = posix_memalign(&buf, VHD_SECTOR_SIZE, size); if (err) { buf = NULL; err = -err; goto fail; } err = vhd_read(ctx, buf, size); if (err) goto fail; memcpy(&batmap->header, buf, sizeof(vhd_batmap_header_t)); free(buf); buf = NULL; vhd_batmap_header_in(batmap); return 0; fail: free(buf); memset(&batmap->header, 0, sizeof(vhd_batmap_header_t)); VHDLOG("%s: failed to read batmap header: %d\n", ctx->file, err); return err; } static int vhd_read_batmap_map(vhd_context_t *ctx, vhd_batmap_t *batmap) { int err; void *buf; off64_t off; size_t map_size; map_size = vhd_sectors_to_bytes(secs_round_up_no_zero( ctx->footer.curr_size >> (VHD_BLOCK_SHIFT + 3))); ASSERT(vhd_sectors_to_bytes(batmap->header.batmap_size) >= map_size); err = posix_memalign(&buf, VHD_SECTOR_SIZE, map_size); if (err) { buf = NULL; err = -err; goto fail; } off = batmap->header.batmap_offset; err = vhd_seek(ctx, off, SEEK_SET); if (err) goto fail; err = 
vhd_read(ctx, buf, map_size); if (err) goto fail; batmap->map = buf; return 0; fail: free(buf); batmap->map = NULL; VHDLOG("%s: failed to read batmap: %d\n", ctx->file, err); return err; } int vhd_read_batmap(vhd_context_t *ctx, vhd_batmap_t *batmap) { int err; if (!vhd_has_batmap(ctx)) return -EINVAL; memset(batmap, 0, sizeof(vhd_batmap_t)); err = vhd_read_batmap_header(ctx, batmap); if (err) return err; err = vhd_validate_batmap_header(batmap); if (err) return err; err = vhd_read_batmap_map(ctx, batmap); if (err) return err; err = vhd_validate_batmap(ctx, batmap); if (err) goto fail; return 0; fail: free(batmap->map); memset(batmap, 0, sizeof(vhd_batmap_t)); return err; } int vhd_has_batmap(vhd_context_t *ctx) { if (!vhd_type_dynamic(ctx)) return 0; if (!vhd_creator_tapdisk(ctx)) return 0; if (ctx->footer.crtr_ver <= VHD_VERSION(0, 1)) return 0; if (ctx->footer.crtr_ver >= VHD_VERSION(1, 2)) return 1; /* * VHDs of version 1.1 probably have a batmap, but may not * if they were updated from version 0.1 via vhd-update. */ if (!vhd_validate_batmap_header(&ctx->batmap)) return 1; if (vhd_read_batmap_header(ctx, &ctx->batmap)) return 0; return (!vhd_validate_batmap_header(&ctx->batmap)); } /* * Is this a block device (with a fixed size)? This affects whether the file * can be truncated and where the footer is written for VHDs. 
*/
int
vhd_test_file_fixed(const char *file, int *is_block)
{
	int err;
	struct stat stats;

	err = stat(file, &stats);
	if (err == -1)
		return -errno;

	*is_block = !!(S_ISBLK(stats.st_mode));
	return err;
}

/*
 * Resolve the parent name @parent of the image @ctx to a readable path.
 *
 * An absolute @parent that is readable is used as is.  Otherwise @parent
 * is tried relative to the directory containing ctx->file.  On success
 * *_location is a malloc()ed string the caller must free() and 0 is
 * returned.  On failure *_location is NULL and a negative errno value is
 * returned.
 */
int
vhd_find_parent(vhd_context_t *ctx, const char *parent, char **_location)
{
	char *location, __location[PATH_MAX];
	char *cpath, __cpath[PATH_MAX];
	char *cdir, *path;
	int err;

	err        = 0;
	path       = NULL;
	cpath      = NULL;
	location   = NULL;
	*_location = NULL;

	if (!parent)
		return -EINVAL;

	if (parent[0] == '/') {
		if (!access(parent, R_OK)) {
			*_location = strdup(parent);
			if (!*_location)
				return -errno;
			return 0;
		}
	}

	/* check parent path relative to child's directory */
	cpath = realpath(ctx->file, __cpath);
	if (!cpath) {
		err = -errno;
		goto out;
	}

	cdir = dirname(cpath);
	if (asprintf(&location, "%s/%s", cdir, parent) == -1) {
		err = -errno;
		location = NULL;
		goto out;
	}

	/*
	 * A parent that cannot be found is an error.  Falling through with
	 * err == 0 would hand the caller a NULL location marked as success.
	 */
	if (access(location, R_OK)) {
		err = (errno ? -errno : -ENOENT);
		goto out;
	}

	path = realpath(location, __location);
	if (!path) {
		err = (errno ? -errno : -ENOENT);
		goto out;
	}

	*_location = strdup(path);
	if (!*_location)
		err = (errno ? -errno : -ENOMEM);

out:
	/* the asprintf() buffer is only needed for the lookup itself */
	free(location);
	return err;
}

int
vhd_macx_encode_location(char *name, char **out, int *outlen)
{
	iconv_t cd;
	int len, err;
	size_t ibl, obl;
	char *uri, *urip, *uri_utf8, *uri_utf8p, *ret;

	err     = 0;
	ret     = NULL;
	*out    = NULL;
	*outlen = 0;
	len     = strlen(name) + strlen("file://");

	ibl     = len;
	obl     = len;

	uri = urip = malloc(ibl + 1);
	uri_utf8 = uri_utf8p = malloc(obl);

	if (!uri || !uri_utf8) {
		/* one of the two may have succeeded */
		free(uri);
		free(uri_utf8);
		return -ENOMEM;
	}

	cd = iconv_open("UTF-8", "ASCII");
	if (cd == (iconv_t)-1) {
		err = -errno;
		goto out;
	}

	sprintf(uri, "file://%s", name);

	if (iconv(cd, &urip, &ibl, &uri_utf8p, &obl) == (size_t)-1 ||
	    ibl || obl) {
		err = (errno ?
-errno : -EIO); goto out; } ret = malloc(len); if (!ret) { err = -ENOMEM; goto out; } memcpy(ret, uri_utf8, len); *outlen = len; *out = ret; out: free(uri); free(uri_utf8); if (cd != (iconv_t)-1) iconv_close(cd); return err; } int vhd_w2u_encode_location(char *name, char **out, int *outlen) { iconv_t cd; int len, err; size_t ibl, obl; char *uri, *urip, *uri_utf16, *uri_utf16p, *tmp, *ret; err = 0; ret = NULL; *out = NULL; *outlen = 0; cd = (iconv_t) -1; /* * MICROSOFT_COMPAT * relative paths must start with ".\" */ if (name[0] != '/') { tmp = strstr(name, "./"); if (tmp == name) tmp += strlen("./"); else tmp = name; err = asprintf(&uri, ".\\%s", tmp); } else err = asprintf(&uri, "%s", name); if (err == -1) return -ENOMEM; tmp = uri; while (*tmp != '\0') { if (*tmp == '/') *tmp = '\\'; tmp++; } len = strlen(uri); ibl = len; obl = len * 2; urip = uri; uri_utf16 = uri_utf16p = malloc(obl); if (!uri_utf16) { err = -ENOMEM; goto out; } /* * MICROSOFT_COMPAT * little endian unicode here */ cd = iconv_open("UTF-16LE", "ASCII"); if (cd == (iconv_t)-1) { err = -errno; goto out; } if (iconv(cd, &urip, &ibl, &uri_utf16p, &obl) == (size_t)-1 || ibl || obl) { err = (errno ? 
-errno : -EIO); goto out; } len = len * 2; ret = malloc(len); if (!ret) { err = -ENOMEM; goto out; } memcpy(ret, uri_utf16, len); *outlen = len; *out = ret; err = 0; out: free(uri); free(uri_utf16); if (cd != (iconv_t)-1) iconv_close(cd); return err; } static char * vhd_macx_decode_location(char *in, char *out, int len) { iconv_t cd; char *name; size_t ibl, obl; name = out; ibl = obl = len; cd = iconv_open("ASCII", "UTF-8"); if (cd == (iconv_t)-1) return NULL; if (iconv(cd, &in, &ibl, &out, &obl) == (size_t)-1 || ibl) return NULL; iconv_close(cd); *out = '\0'; if (strstr(name, "file://") != name) return NULL; name += strlen("file://"); return strdup(name); } static char * vhd_w2u_decode_location(char *in, char *out, int len, char *utf_type) { iconv_t cd; char *name, *tmp; size_t ibl, obl; tmp = name = out; ibl = obl = len; cd = iconv_open("ASCII", utf_type); if (cd == (iconv_t)-1) return NULL; if (iconv(cd, &in, &ibl, &out, &obl) == (size_t)-1 || ibl) return NULL; iconv_close(cd); *out = '\0'; /* TODO: spaces */ while (tmp != out) { if (*tmp == '\\') *tmp = '/'; tmp++; } if (strstr(name, "C:") == name || strstr(name, "c:") == name) name += strlen("c:"); return strdup(name); } int vhd_header_decode_parent(vhd_context_t *ctx, vhd_header_t *header, char **buf) { char *code, out[512]; if (vhd_creator_tapdisk(ctx) && ctx->footer.crtr_ver == VHD_VERSION(0, 1)) code = UTF_16; else code = UTF_16BE; *buf = vhd_w2u_decode_location(header->prt_name, out, 512, code); return (*buf == NULL ? 
-EINVAL : 0);
}

/*
 * Read and decode one parent locator entry.
 *
 * Only MACX, W2KU and W2RU locators on differencing disks are accepted.
 * On success *parent is a malloc'ed path (caller frees) and 0 is
 * returned; otherwise a negative errno value is returned and the
 * failure is logged together with the locator fields.
 */
int
vhd_parent_locator_read(vhd_context_t *ctx,
			vhd_parent_locator_t *loc, char **parent)
{
	int err, size;
	void *raw, *out, *name;

	raw     = NULL;
	out     = NULL;
	name    = NULL;
	*parent = NULL;

	if (ctx->footer.type != HD_TYPE_DIFF) {
		err = -EINVAL;
		goto out;
	}

	switch (loc->code) {
	case PLAT_CODE_MACX:
	case PLAT_CODE_W2KU:
	case PLAT_CODE_W2RU:
		break;
	default:
		err = -EINVAL;
		goto out;
	}

	err = vhd_seek(ctx, loc->data_offset, SEEK_SET);
	if (err)
		goto out;

	size = vhd_parent_locator_size(loc);
	if (size <= 0) {
		err = -EINVAL;
		goto out;
	}

	/* sector-aligned buffer for the raw on-disk locator */
	err = posix_memalign(&raw, VHD_SECTOR_SIZE, size);
	if (err) {
		raw = NULL;
		err = -err;
		goto out;
	}

	err = vhd_read(ctx, raw, size);
	if (err)
		goto out;

	/*
	 * scratch buffer for the decoders (+1 for the terminator).
	 * NOTE(review): data_len is not checked against size here; confirm
	 * vhd_parent_locator_size() guarantees size >= data_len.
	 */
	out = malloc(loc->data_len + 1);
	if (!out) {
		err = -ENOMEM;
		goto out;
	}

	switch (loc->code) {
	case PLAT_CODE_MACX:
		name = vhd_macx_decode_location(raw, out, loc->data_len);
		break;
	case PLAT_CODE_W2KU:
	case PLAT_CODE_W2RU:
		name = vhd_w2u_decode_location(raw, out,
					       loc->data_len, UTF_16LE);
		break;
	}

	if (!name) {
		err = -EINVAL;
		goto out;
	}

	err     = 0;
	*parent = name;

 out:
	free(raw);
	free(out);

	if (err) {
		VHDLOG("%s: error reading parent locator: %d\n",
		       ctx->file, err);
		VHDLOG("%s: locator: code %u, space 0x%x, len 0x%x, "
		       "off 0x%"PRIx64"\n", ctx->file, loc->code,
		       loc->data_space, loc->data_len, loc->data_offset);
	}

	return err;
}

/*
 * Walk the header's parent locators and return (in *parent, caller
 * frees) the first one that vhd_find_parent() can resolve.  Locators
 * that cannot be read are skipped.  Returns 0 on success; otherwise the
 * last vhd_find_parent() error, or -EINVAL if none was attempted.
 */
int
vhd_parent_locator_get(vhd_context_t *ctx, char **parent)
{
	int i, n, err;
	char *name, *location;
	vhd_parent_locator_t *loc;

	err     = -EINVAL;
	*parent = NULL;

	if (ctx->footer.type != HD_TYPE_DIFF)
		return -EINVAL;

	n = vhd_parent_locator_count(ctx);
	for (i = 0; i < n; i++) {
		int _err;

		loc = ctx->header.loc + i;
		/* unreadable locator: try the next one, keep previous err */
		_err = vhd_parent_locator_read(ctx, loc, &name);
		if (_err)
			continue;

		err = vhd_find_parent(ctx, name, &location);
		if (err)
			VHDLOG("%s: couldn't find parent %s (%d)\n",
			       ctx->file, name, err);
		free(name);

		if (!err) {
			*parent = location;
			return 0;
		}
	}

	return err;
}

/*
 * Encode @parent for platform @code and write it at byte offset @off,
 * then fill in @loc to describe what was written.  A non-zero
 * @max_bytes caps the padded size of the encoded locator.
 */
int
vhd_parent_locator_write_at(vhd_context_t *ctx,
			    const char *parent, off64_t off, uint32_t code,
size_t max_bytes, vhd_parent_locator_t *loc)
{
	struct stat stats;
	int err, len, size;
	char *absolute_path, *relative_path, *encoded;
	char __parent[PATH_MAX];
	void *block;

	memset(loc, 0, sizeof(vhd_parent_locator_t));

	if (ctx->footer.type != HD_TYPE_DIFF)
		return -EINVAL;

	absolute_path = NULL;
	relative_path = NULL;
	encoded       = NULL;
	block         = NULL;
	size          = 0;
	len           = 0;

	switch (code) {
	case PLAT_CODE_MACX:
	case PLAT_CODE_W2KU:
	case PLAT_CODE_W2RU:
		break;
	default:
		return -EINVAL;
	}

	/* the parent must exist and be a regular file or block device */
	absolute_path = realpath(parent, __parent);
	if (!absolute_path) {
		err = -errno;
		goto out;
	}

	err = stat(absolute_path, &stats);
	if (err) {
		err = -errno;
		goto out;
	}

	if (!S_ISREG(stats.st_mode) && !S_ISBLK(stats.st_mode)) {
		err = -EINVAL;
		goto out;
	}

	/* locators store the parent path relative to the child image */
	relative_path = relative_path_to(ctx->file, absolute_path, &err);
	if (!relative_path || err) {
		err = (err ? err : -EINVAL);
		goto out;
	}

	switch (code) {
	case PLAT_CODE_MACX:
		err = vhd_macx_encode_location(relative_path, &encoded, &len);
		break;
	case PLAT_CODE_W2KU:
	case PLAT_CODE_W2RU:
		err = vhd_w2u_encode_location(relative_path, &encoded, &len);
		break;
	default:
		err = -EINVAL;
	}

	if (err)
		goto out;

	err = vhd_seek(ctx, off, SEEK_SET);
	if (err)
		goto out;

	size = vhd_bytes_padded(len);

	if (max_bytes && size > max_bytes) {
		err = -ENAMETOOLONG;
		goto out;
	}

	/* write a zero-padded, sector-aligned copy of the encoded name */
	err = posix_memalign(&block, VHD_SECTOR_SIZE, size);
	if (err) {
		block = NULL;
		err   = -err;
		goto out;
	}

	memset(block, 0, size);
	memcpy(block, encoded, len);

	err = vhd_write(ctx, block, size);
	if (err)
		goto out;

	err = 0;

 out:
	free(relative_path);
	free(encoded);
	free(block);

	if (!err) {
		loc->res         = 0;
		loc->code        = code;
		loc->data_len    = len;
		/*
		 * write number of bytes ('size') instead of number of sectors
		 * into loc->data_space to be compatible with MSFT, even though
		 * this goes against the specs
		 */
		loc->data_space  = size;
		loc->data_offset = off;
	}

	return err;
}

/*
 * Compute the offset of a footer stored in the last
 * sizeof(vhd_footer_t) bytes of the file.  Moves the file position to
 * end of file.  Returns 0 or a negative errno value.
 */
static int
vhd_footer_offset_at_eof(vhd_context_t *ctx, off64_t *off)
{
	int err;

	/*
	 * vhd_seek() already returns a negative errno value; propagate it
	 * rather than the raw (positive, possibly clobbered) errno.
	 */
	if ((err = vhd_seek(ctx, 0, SEEK_END)))
		return err;

	*off =
vhd_position(ctx) - sizeof(vhd_footer_t);
	return 0;
}

/*
 * Read the sector bitmap of BAT entry @block into a newly allocated,
 * sector-aligned buffer (*bufp, caller frees).
 *
 * Returns -EINVAL for non-dynamic images or unallocated blocks and
 * -ERANGE for a block index beyond the BAT.
 */
int
vhd_read_bitmap(vhd_context_t *ctx, uint32_t block, char **bufp)
{
	int err;
	void *buf;
	size_t size;
	off64_t off;
	uint64_t blk;

	buf   = NULL;
	*bufp = NULL;

	if (!vhd_type_dynamic(ctx))
		return -EINVAL;

	err = vhd_get_bat(ctx);
	if (err)
		return err;

	if (block >= ctx->bat.entries)
		return -ERANGE;

	blk  = ctx->bat.bat[block];
	if (blk == DD_BLK_UNUSED)
		return -EINVAL;

	/* the bitmap sits at the start of the block: one bit per sector */
	off  = vhd_sectors_to_bytes(blk);
	size = vhd_bytes_padded(ctx->spb >> 3);

	err  = vhd_seek(ctx, off, SEEK_SET);
	if (err)
		return err;

	err  = posix_memalign(&buf, VHD_SECTOR_SIZE, size);
	if (err)
		return -err;

	err  = vhd_read(ctx, buf, size);
	if (err)
		goto fail;

	*bufp = buf;
	return 0;

fail:
	free(buf);
	return err;
}

/*
 * Read the data area of BAT entry @block into a newly allocated,
 * sector-aligned buffer (*bufp, caller frees).  If the block extends
 * past the footer position at end of file, only the bytes up to that
 * position are read and the remainder of the buffer is zeroed.
 */
int
vhd_read_block(vhd_context_t *ctx, uint32_t block, char **bufp)
{
	int err;
	void *buf;
	size_t size;
	uint64_t blk;
	off64_t end, off;

	buf   = NULL;
	*bufp = NULL;

	if (!vhd_type_dynamic(ctx))
		return -EINVAL;

	err = vhd_get_bat(ctx);
	if (err)
		return err;

	if (block >= ctx->bat.entries)
		return -ERANGE;

	blk  = ctx->bat.bat[block];
	if (blk == DD_BLK_UNUSED)
		return -EINVAL;

	/* data follows the per-block bitmap */
	off  = vhd_sectors_to_bytes(blk + ctx->bm_secs);
	size = vhd_sectors_to_bytes(ctx->spb);

	err  = vhd_footer_offset_at_eof(ctx, &end);
	if (err)
		return err;

	err  = posix_memalign(&buf, VHD_SECTOR_SIZE, size);
	if (err) {
		err = -err;
		goto fail;
	}

	/* truncated last block: shorten the read and zero-fill the tail */
	if (end < off + ctx->header.block_size) {
		size = end - off;
		memset(buf + size, 0, ctx->header.block_size - size);
	}

	err  = vhd_seek(ctx, off, SEEK_SET);
	if (err)
		goto fail;

	err  = vhd_read(ctx, buf, size);
	if (err)
		goto fail;

	*bufp = buf;
	return 0;

fail:
	free(buf);
	return err;
}

/*
 * Write a copy of @footer at byte offset @off.  The copy gets a fresh
 * checksum, is validated, and is passed through vhd_footer_out()
 * before being written; @footer itself is not modified.
 */
int
vhd_write_footer_at(vhd_context_t *ctx, vhd_footer_t *footer, off64_t off)
{
	int err;
	void *buf;
	vhd_footer_t *f;

	f = NULL;

	err = posix_memalign(&buf, VHD_SECTOR_SIZE, sizeof(vhd_footer_t));
	if (err) {
		err = -err;
		goto out;
	}
	f = buf;

	memcpy(f, footer, sizeof(vhd_footer_t));
	f->checksum = vhd_checksum_footer(f);

	err = vhd_validate_footer(f);
	if (err)
		goto out;

	err = vhd_seek(ctx, off,
SEEK_SET);
	if (err)
		goto out;

	vhd_footer_out(f);

	err = vhd_write(ctx, f, sizeof(vhd_footer_t));

out:
	if (err)
		VHDLOG("%s: failed writing footer at 0x%08"PRIx64": %d\n",
		       ctx->file, off, err);
	free(f);
	return err;
}

/*
 * Write the primary footer: at the end of the device for block
 * devices, otherwise right after the last data (the file is then
 * truncated to end just after the footer).  Dynamic images also get
 * the backup copy at offset 0.
 */
int
vhd_write_footer(vhd_context_t *ctx, vhd_footer_t *footer)
{
	int err;
	off64_t off;

	if (ctx->is_block)
		err = vhd_footer_offset_at_eof(ctx, &off);
	else
		err = vhd_end_of_data(ctx, &off);
	if (err)
		return err;

	err = vhd_write_footer_at(ctx, footer, off);
	if (err)
		return err;

	if (!ctx->is_block) {
		/* drop anything that used to follow the footer */
		err = ftruncate(ctx->fd, off + sizeof(vhd_footer_t));
		if (err)
			return -errno;
	}

	if (!vhd_type_dynamic(ctx))
		return 0;

	return vhd_write_footer_at(ctx, footer, 0);
}

/*
 * Write a copy of @header at byte offset @off (dynamic images only).
 * The copy gets a fresh checksum, is validated, and is passed through
 * vhd_header_out() before being written; @header is not modified.
 */
int
vhd_write_header_at(vhd_context_t *ctx, vhd_header_t *header, off64_t off)
{
	int err;
	vhd_header_t *h;
	void *buf;

	h = NULL;

	if (!vhd_type_dynamic(ctx)) {
		err = -EINVAL;
		goto out;
	}

	err = posix_memalign(&buf, VHD_SECTOR_SIZE, sizeof(vhd_header_t));
	if (err) {
		err = -err;
		goto out;
	}
	h = buf;

	memcpy(h, header, sizeof(vhd_header_t));

	h->checksum = vhd_checksum_header(h);
	err = vhd_validate_header(h);
	if (err)
		goto out;

	vhd_header_out(h);

	err = vhd_seek(ctx, off, SEEK_SET);
	if (err)
		goto out;

	err = vhd_write(ctx, h, sizeof(vhd_header_t));

out:
	if (err)
		VHDLOG("%s: failed writing header at 0x%08"PRIx64": %d\n",
		       ctx->file, off, err);
	free(h);
	return err;
}

/* Write @header at the offset recorded in the footer's data_offset. */
int
vhd_write_header(vhd_context_t *ctx, vhd_header_t *header)
{
	off64_t off;

	if (!vhd_type_dynamic(ctx))
		return -EINVAL;

	off = ctx->footer.data_offset;
	return vhd_write_header_at(ctx, header, off);
}

/*
 * Write @bat to the table offset recorded in the header.  A private
 * copy is passed through vhd_bat_out() so the caller's table is left
 * untouched.
 */
int
vhd_write_bat(vhd_context_t *ctx, vhd_bat_t *bat)
{
	int err;
	off64_t off;
	vhd_bat_t b;
	void *buf;
	size_t size;

	if (!vhd_type_dynamic(ctx))
		return -EINVAL;

	err = vhd_validate_bat(&ctx->bat);
	if (err)
		return err;

	err = vhd_validate_bat(bat);
	if (err)
		return err;

	memset(&b, 0, sizeof(vhd_bat_t));

	off  = ctx->header.table_offset;
	size = vhd_bytes_padded(bat->entries * sizeof(uint32_t));

	err  = vhd_seek(ctx, off, SEEK_SET);
	if (err)
		return err;

	err  =
posix_memalign(&buf, VHD_SECTOR_SIZE, size);
	if (err)
		return -err;
	b.bat = buf;

	/* copies the padded size; assumes bat->bat was allocated padded */
	memcpy(b.bat, bat->bat, size);
	b.spb     = bat->spb;
	b.entries = bat->entries;
	vhd_bat_out(&b);

	err = vhd_write(ctx, b.bat, size);
	free(b.bat);

	return err;
}

/*
 * Write only the batmap header, at the offset reported by
 * vhd_batmap_header_offset().
 *
 * NOTE(review): vhd_batmap_header_out() is applied to the caller's
 * @batmap directly, and the buffer is sized from sizeof(*batmap)
 * rather than the header type - confirm both are intended.
 */
static int
vhd_write_batmap_header(vhd_context_t *ctx, vhd_batmap_t *batmap)
{
	int err;
	size_t size;
	off64_t off;
	void *buf = NULL;

	err = vhd_batmap_header_offset(ctx, &off);
	if (err)
		goto out;

	size = vhd_bytes_padded(sizeof(*batmap));

	err = vhd_seek(ctx, off, SEEK_SET);
	if (err)
		goto out;

	err = posix_memalign(&buf, VHD_SECTOR_SIZE, size);
	if (err) {
		err = -err;
		goto out;
	}

	vhd_batmap_header_out(batmap);
	memset(buf, 0, size);
	memcpy(buf, &batmap->header, sizeof(batmap->header));

	err = vhd_write(ctx, buf, size);

out:
	if (err)
		VHDLOG("%s: failed writing batmap: %d\n", ctx->file, err);
	free(buf);
	return err;
}

/*
 * Write the batmap bits followed by the batmap header.  The checksum
 * is recomputed on a local copy of the header; the map bytes are
 * staged through a sector-aligned buffer.
 */
int
vhd_write_batmap(vhd_context_t *ctx, vhd_batmap_t *batmap)
{
	int err;
	off64_t off;
	vhd_batmap_t b;
	void *buf, *map;
	size_t size, map_size;

	buf      = NULL;
	map      = NULL;

	if (!vhd_has_batmap(ctx)) {
		err = -EINVAL;
		goto out;
	}

	b.header = batmap->header;
	b.map    = batmap->map;

	b.header.checksum = vhd_checksum_batmap(ctx, &b);
	err = vhd_validate_batmap(ctx, &b);
	if (err)
		goto out;

	/* one bit per block of the current virtual size, sector-rounded */
	off      = b.header.batmap_offset;
	map_size = vhd_sectors_to_bytes(secs_round_up_no_zero(
			ctx->footer.curr_size >> (VHD_BLOCK_SHIFT + 3)));
	ASSERT(vhd_sectors_to_bytes(b.header.batmap_size) >= map_size);

	err = vhd_seek(ctx, off, SEEK_SET);
	if (err)
		goto out;

	err = posix_memalign(&map, VHD_SECTOR_SIZE, map_size);
	if (err) {
		map = NULL;
		err = -err;
		goto out;
	}

	memcpy(map, b.map, map_size);

	err = vhd_write(ctx, map, map_size);
	if (err)
		goto out;

	/* now the header itself */
	err = vhd_batmap_header_offset(ctx, &off);
	if (err)
		goto out;

	size = vhd_bytes_padded(sizeof(vhd_batmap_header_t));

	err = vhd_seek(ctx, off, SEEK_SET);
	if (err)
		goto out;

	err = posix_memalign(&buf, VHD_SECTOR_SIZE, size);
	if (err) {
		err = -err;
		buf = NULL;
		goto out;
	}

	vhd_batmap_header_out(&b);
	memset(buf, 0, size);
	memcpy(buf,
&b.header, sizeof(vhd_batmap_header_t)); err = vhd_write(ctx, buf, size); out: if (err) VHDLOG("%s: failed writing batmap: %d\n", ctx->file, err); free(buf); free(map); return 0; } int vhd_write_bitmap(vhd_context_t *ctx, uint32_t block, char *bitmap) { int err; off64_t off; uint64_t blk; size_t size; if (!vhd_type_dynamic(ctx)) return -EINVAL; err = vhd_validate_bat(&ctx->bat); if (err) return err; if (block >= ctx->bat.entries) return -ERANGE; if ((unsigned long)bitmap & (VHD_SECTOR_SIZE - 1)) return -EINVAL; blk = ctx->bat.bat[block]; if (blk == DD_BLK_UNUSED) return -EINVAL; off = vhd_sectors_to_bytes(blk); size = vhd_sectors_to_bytes(ctx->bm_secs); err = vhd_seek(ctx, off, SEEK_SET); if (err) return err; err = vhd_write(ctx, bitmap, size); if (err) return err; return 0; } int vhd_write_block(vhd_context_t *ctx, uint32_t block, char *data) { int err; off64_t off; size_t size; uint64_t blk; if (!vhd_type_dynamic(ctx)) return -EINVAL; err = vhd_validate_bat(&ctx->bat); if (err) return err; if (block >= ctx->bat.entries) return -ERANGE; if ((unsigned long)data & ~(VHD_SECTOR_SIZE -1)) return -EINVAL; blk = ctx->bat.bat[block]; if (blk == DD_BLK_UNUSED) return -EINVAL; off = vhd_sectors_to_bytes(blk + ctx->bm_secs); size = vhd_sectors_to_bytes(ctx->spb); err = vhd_seek(ctx, off, SEEK_SET); if (err) return err; err = vhd_write(ctx, data, size); if (err) return err; return 0; } static inline int namedup(char **dup, const char *name) { *dup = NULL; if (strnlen(name, MAX_NAME_LEN) >= MAX_NAME_LEN) return -ENAMETOOLONG; *dup = strdup(name); if (*dup == NULL) return -ENOMEM; return 0; } #define vwrite (ssize_t (*)(int, void *, size_t))write #define vpwrite (ssize_t (*)(int, void *, size_t, off_t))pwrite static ssize_t vhd_atomic_pio(ssize_t (*f) (int, void *, size_t, off_t), int fd, void *_s, size_t n, off_t off) { char *s = _s; size_t pos = 0; ssize_t res; struct stat st; memset(&st, 0, sizeof(st)); for (;;) { res = (f) (fd, s + pos, n - pos, off + pos); switch (res) { 
case -1: if (errno == EINTR || errno == EAGAIN) continue; else return 0; break; case 0: errno = EPIPE; return pos; } if (pos + res == n) return n; if (!st.st_size) if (fstat(fd, &st) == -1) return -1; if (off + pos + res == st.st_size) return pos + res; pos += (res & ~(VHD_SECTOR_SIZE - 1)); } return -1; } static ssize_t vhd_atomic_io(ssize_t (*f) (int, void *, size_t), int fd, void *_s, size_t n) { off64_t off; ssize_t res; ssize_t (*pf) (int, void *, size_t, off_t); off = lseek64(fd, 0, SEEK_CUR); if (off == (off_t)-1) return -1; pf = (f == read ? pread : vpwrite); res = vhd_atomic_pio(pf, fd, _s, n, off); if (res > 0) if (lseek64(fd, off + res, SEEK_SET) == (off64_t)-1) return -1; return res; } int vhd_seek(vhd_context_t *ctx, off64_t offset, int whence) { off64_t off; off = lseek64(ctx->fd, offset, whence); if (off == (off64_t)-1) { VHDLOG("%s: seek(0x%08"PRIx64", %d) failed: %d\n", ctx->file, offset, whence, -errno); return -errno; } return 0; } off64_t vhd_position(vhd_context_t *ctx) { return lseek64(ctx->fd, 0, SEEK_CUR); } int vhd_read(vhd_context_t *ctx, void *buf, size_t size) { size_t ret; errno = 0; ret = vhd_atomic_io(read, ctx->fd, buf, size); if (ret == size) return 0; VHDLOG("%s: read of %zu returned %zd, errno: %d\n", ctx->file, size, ret, -errno); return (errno ? -errno : -EIO); } int vhd_write(vhd_context_t *ctx, void *buf, size_t size) { size_t ret; errno = 0; ret = vhd_atomic_io(vwrite, ctx->fd, buf, size); if (ret == size) return 0; VHDLOG("%s: write of %zu returned %zd, errno: %d\n", ctx->file, size, ret, -errno); return (errno ? -errno : -EIO); } static int vhd_pread(vhd_context_t *ctx, void *buf, size_t size, off64_t offset) { ssize_t ret; errno = 0; ret = vhd_atomic_pio(pread, ctx->fd, buf, size, offset); if (ret == size) return 0; VHDLOG("%s: pread of %zu returned %zd, errno: %d\n", ctx->file, size, ret, -errno); return (errno ? 
-errno : -EIO);
}

/* Write exactly @size bytes at @offset without moving the file position. */
static int
vhd_pwrite(vhd_context_t *ctx, void *buf, size_t size, off64_t offset)
{
	ssize_t ret;

	errno = 0;

	ret = vhd_atomic_pio(vpwrite, ctx->fd, buf, size, offset);
	if (ret == size)
		return 0;

	VHDLOG("%s: pwrite of %zu returned %zd, errno: %d\n",
	       ctx->file, size, ret, -errno);

	return (errno ? -errno : -EIO);
}

/*
 * Translate virtual sector @sector into a physical sector number.
 *
 * Fixed images map one-to-one.  For dynamic images *offset is
 * DD_BLK_UNUSED when the containing block is not allocated.
 * Returns 0 on success, -ERANGE if @sector lies beyond the BAT, or the
 * error from vhd_get_bat().
 */
int
vhd_offset(vhd_context_t *ctx, uint32_t sector, uint32_t *offset)
{
	int err;
	uint32_t block;

	/*
	 * The sector number used to be returned as if it were a status
	 * code, leaving *offset untouched; report it through the
	 * out-parameter like the dynamic case does.
	 */
	if (!vhd_type_dynamic(ctx)) {
		*offset = sector;
		return 0;
	}

	err = vhd_get_bat(ctx);
	if (err)
		return err;

	block = sector / ctx->spb;
	/* never index past the table */
	if (block >= ctx->bat.entries)
		return -ERANGE;

	if (ctx->bat.bat[block] == DD_BLK_UNUSED)
		*offset = DD_BLK_UNUSED;
	else
		*offset = ctx->bat.bat[block] +
			ctx->bm_secs + (sector % ctx->spb);

	return 0;
}

/*
 * Fast open path: fetch footer and header with a single read from the
 * current file position, and fall back to vhd_read_header() when the
 * header does not immediately follow the footer.
 */
int
vhd_open_fast(vhd_context_t *ctx)
{
	int err;
	void *buf;
	size_t size;

	size = sizeof(vhd_footer_t) + sizeof(vhd_header_t);
	err  = posix_memalign(&buf, VHD_SECTOR_SIZE, size);
	if (err) {
		VHDLOG("failed allocating %s: %d\n", ctx->file, -err);
		return -err;
	}

	err = vhd_read(ctx, buf, size);
	if (err) {
		VHDLOG("failed reading %s: %d\n", ctx->file, err);
		goto out;
	}

	memcpy(&ctx->footer, buf, sizeof(vhd_footer_t));
	vhd_footer_in(&ctx->footer);
	err = vhd_validate_footer(&ctx->footer);
	if (err)
		goto out;

	if (vhd_type_dynamic(ctx)) {
		if (ctx->footer.data_offset != sizeof(vhd_footer_t))
			err = vhd_read_header(ctx, &ctx->header);
		else {
			memcpy(&ctx->header,
			       buf + sizeof(vhd_footer_t),
			       sizeof(vhd_header_t));
			vhd_header_in(&ctx->header);
			err = vhd_validate_header(&ctx->header);
		}

		if (err)
			goto out;

		ctx->spb     = ctx->header.block_size >> VHD_SECTOR_SHIFT;
		ctx->bm_secs = secs_round_up_no_zero(ctx->spb >> 3);
	}

out:
	free(buf);
	return err;
}

/*
 * Open the VHD image @file and populate @ctx.  On failure @ctx is
 * zeroed and a negative errno value is returned.
 */
int
vhd_open(vhd_context_t *ctx, const char *file, int flags)
{
	int i, err, oflags;

	/* strict validation and the fast path are mutually exclusive */
	if (flags & VHD_OPEN_STRICT)
		vhd_flag_clear(flags, VHD_OPEN_FAST);

	memset(ctx, 0, sizeof(vhd_context_t));
	vhd_cache_init(ctx);

	ctx->fd     = -1;
	ctx->oflags = flags;

	err = namedup(&ctx->file, file);
	if (err)
		return err;

	oflags = O_LARGEFILE;
	if (!(flags & VHD_OPEN_CACHED))
		oflags |=
O_DIRECT;
	if (flags & VHD_OPEN_RDONLY)
		oflags |= O_RDONLY;
	if (flags & VHD_OPEN_RDWR)
		oflags |= O_RDWR;

	ctx->fd = open(ctx->file, oflags, 0644);
	if (ctx->fd == -1) {
		err = -errno;
		VHDLOG("failed to open %s: %d\n", ctx->file, err);
		goto fail;
	}

	err = vhd_test_file_fixed(ctx->file, &ctx->is_block);
	if (err)
		goto fail;

	/* the fast path skips the disabled check and cache loading below */
	if (flags & VHD_OPEN_FAST) {
		err = vhd_open_fast(ctx);
		if (err)
			goto fail;

		return 0;
	}

	err = vhd_read_footer(ctx, &ctx->footer);
	if (err)
		goto fail;

	if (!(flags & VHD_OPEN_IGNORE_DISABLED) && vhd_disabled(ctx)) {
		err = -EINVAL;
		goto fail;
	}

	if (vhd_type_dynamic(ctx)) {
		/* retry the header read, pausing one second per attempt */
		for (i = 0; i < VHD_HEADER_MAX_RETRIES; i++) {
			err = vhd_read_header(ctx, &ctx->header);
			if (!err)
				break;
			VHDLOG("Error reading header, retry %d\n", i);
			sleep(1);
		}
		if (err)
			goto fail;

		ctx->spb     = ctx->header.block_size >> VHD_SECTOR_SHIFT;
		ctx->bm_secs = secs_round_up_no_zero(ctx->spb >> 3);
	}

	err = vhd_cache_load(ctx);
	if (err) {
		VHDLOG("failed to load cache: %d\n", err);
		goto fail;
	}

	return 0;

fail:
	if (ctx->fd != -1)
		close(ctx->fd);
	free(ctx->file);
	memset(ctx, 0, sizeof(vhd_context_t));
	return err;
}

/*
 * Release everything held by @ctx: cached parents, the file descriptor
 * (fsync'ed first), the file name, BAT and batmap.  @ctx is zeroed
 * afterwards, so none of its fields may be read after this call.
 */
void
vhd_close(vhd_context_t *ctx)
{
	vhd_cache_unload(ctx);

	/* the descriptor is only closed when a file name was recorded */
	if (ctx->file) {
		fsync(ctx->fd);
		close(ctx->fd);
	}

	free(ctx->file);
	free(ctx->bat.bat);
	free(ctx->batmap.map);
	memset(ctx, 0, sizeof(vhd_context_t));
}

/*
 * Fill in a fresh footer for an image of @type and virtual @size:
 * cookie, versions, timestamp, geometry, creator "tap" and a newly
 * generated UUID.  data_offset starts out as all ones.
 */
static inline void
vhd_initialize_footer(vhd_context_t *ctx, int type, uint64_t size)
{
	memset(&ctx->footer, 0, sizeof(vhd_footer_t));
	memcpy(ctx->footer.cookie, HD_COOKIE, sizeof(ctx->footer.cookie));
	ctx->footer.features     = HD_RESERVED;
	ctx->footer.ff_version   = HD_FF_VERSION;
	ctx->footer.timestamp    = vhd_time(time(NULL));
	ctx->footer.crtr_ver     = VHD_CURRENT_VERSION;
	ctx->footer.crtr_os      = 0x00000000;
	ctx->footer.orig_size    = size;
	ctx->footer.curr_size    = size;
	ctx->footer.geometry     = vhd_chs(size);
	ctx->footer.type         = type;
	ctx->footer.saved        = 0;
	ctx->footer.data_offset  = 0xFFFFFFFFFFFFFFFFULL;
	strcpy(ctx->footer.crtr_app, "tap");
	uuid_generate(ctx->footer.uuid);
}

/*
 * Store the base name of @parent_path, converted from ASCII to
 * UTF_16BE, in ctx->header.prt_name.
 */
int
vhd_initialize_header_parent_name(vhd_context_t *ctx, const char *parent_path) { int err; iconv_t cd; size_t ibl, obl; char *pname, *ppath, *dst; err = 0; pname = NULL; ppath = NULL; /* * MICROSOFT_COMPAT * big endian unicode here */ cd = iconv_open(UTF_16BE, "ASCII"); if (cd == (iconv_t)-1) { err = -errno; goto out; } ppath = strdup(parent_path); if (!ppath) { err = -ENOMEM; goto out; } pname = basename(ppath); if (!strcmp(pname, "")) { err = -EINVAL; goto out; } ibl = strlen(pname); obl = sizeof(ctx->header.prt_name); dst = ctx->header.prt_name; memset(dst, 0, obl); if (iconv(cd, &pname, &ibl, &dst, &obl) == (size_t)-1 || ibl) err = (errno ? -errno : -EINVAL); out: iconv_close(cd); free(ppath); return err; } static off64_t get_file_size(const char *name) { int fd; off64_t end; fd = open(name, O_LARGEFILE | O_RDONLY); if (fd == -1) { VHDLOG("unable to open '%s': %d\n", name, errno); return -errno; } end = lseek64(fd, 0, SEEK_END); close(fd); return end; } static int vhd_initialize_header(vhd_context_t *ctx, const char *parent_path, uint64_t size, int raw, uint64_t *psize) { int err; struct stat stats; vhd_context_t parent; if (!vhd_type_dynamic(ctx)) return -EINVAL; memset(&ctx->header, 0, sizeof(vhd_header_t)); memcpy(ctx->header.cookie, DD_COOKIE, sizeof(ctx->header.cookie)); ctx->header.data_offset = (uint64_t)-1; ctx->header.table_offset = VHD_SECTOR_SIZE * 3; /* 1 ftr + 2 hdr */ ctx->header.hdr_ver = DD_VERSION; ctx->header.block_size = VHD_BLOCK_SIZE; ctx->header.prt_ts = 0; ctx->header.res1 = 0; ctx->header.max_bat_size = (ctx->footer.curr_size + VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT; ctx->footer.data_offset = VHD_SECTOR_SIZE; if (ctx->footer.type == HD_TYPE_DYNAMIC) return 0; err = stat(parent_path, &stats); if (err == -1) return -errno; if (raw) { ctx->header.prt_ts = vhd_time(stats.st_mtime); *psize = get_file_size(parent_path); if (!size) size = *psize; } else { err = vhd_open(&parent, parent_path, VHD_OPEN_RDONLY); if (err) return err; 
ctx->header.prt_ts = vhd_time(stats.st_mtime);
		uuid_copy(ctx->header.prt_uuid, parent.footer.uuid);
		*psize = parent.footer.curr_size;
		if (!size)
			size = *psize;
		vhd_close(&parent);
	}

	/* a snapshot may not be smaller than its parent */
	if (size < *psize) {
		VHDLOG("snapshot size (%"PRIu64") < parent size (%"PRIu64")\n",
				size, *psize);
		return -EINVAL;
	}
	ctx->footer.orig_size    = size;
	ctx->footer.curr_size    = size;
	ctx->footer.geometry     = vhd_chs(size);
	ctx->header.max_bat_size =
		(size + VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT;

	return vhd_initialize_header_parent_name(ctx, parent_path);
}

/*
 * Write the three parent locators (MACX, W2KU, W2RU) for @parent one
 * after another, starting at the sector-aligned offset just past the
 * batmap, and record them in ctx->header.loc[0..2].
 */
int
vhd_write_parent_locators(vhd_context_t *ctx, const char *parent)
{
	int i, err;
	off64_t off;
	uint32_t code;

	code = PLAT_CODE_NONE;

	if (ctx->footer.type != HD_TYPE_DIFF)
		return -EINVAL;

	off = ctx->batmap.header.batmap_offset +
		vhd_sectors_to_bytes(ctx->batmap.header.batmap_size);
	if (off & (VHD_SECTOR_SIZE - 1))
		off = vhd_bytes_padded(off);

	for (i = 0; i < 3; i++) {
		switch (i) {
		case 0:
			code = PLAT_CODE_MACX;
			break;
		case 1:
			code = PLAT_CODE_W2KU;
			break;
		case 2:
			code = PLAT_CODE_W2RU;
			break;
		}

		/* max_bytes of 0: no size cap for freshly created locators */
		err = vhd_parent_locator_write_at(ctx, parent, off, code,
						  0, ctx->header.loc + i);
		if (err)
			return err;

		off += vhd_parent_locator_size(ctx->header.loc + i);
	}

	return 0;
}

/*
 * Re-point @child at a new parent: update the parent UUID (cleared for
 * raw parents), parent name and timestamp, rewrite the existing
 * locators in place, then write the header.
 */
int
vhd_change_parent(vhd_context_t *child, char *parent_path, int raw)
{
	int i, err;
	char *ppath;
	struct stat stats;
	vhd_context_t parent;
	char __parent_path[PATH_MAX];

	ppath = realpath(parent_path, __parent_path);
	if (!ppath) {
		VHDLOG("error resolving parent path %s for %s: %d\n",
		       parent_path, child->file, errno);
		return -errno;
	}

	err = stat(ppath, &stats);
	if (err == -1) {
		err = -errno;
		goto out;
	}

	if (!S_ISREG(stats.st_mode) && !S_ISBLK(stats.st_mode)) {
		err = -EINVAL;
		goto out;
	}

	if (raw) {
		/* raw parents carry no VHD footer, hence no UUID */
		uuid_clear(child->header.prt_uuid);
	} else {
		err = vhd_open(&parent, ppath, VHD_OPEN_RDONLY);
		if (err) {
			VHDLOG("error opening parent %s for %s: %d\n",
			       ppath, child->file, err);
			goto out;
		}
		uuid_copy(child->header.prt_uuid, parent.footer.uuid);
		vhd_close(&parent);
	}

vhd_initialize_header_parent_name(child, ppath); child->header.prt_ts = vhd_time(stats.st_mtime); for (i = 0; i < vhd_parent_locator_count(child); i++) { vhd_parent_locator_t *loc = child->header.loc + i; size_t max = vhd_parent_locator_size(loc); switch (loc->code) { case PLAT_CODE_MACX: case PLAT_CODE_W2KU: case PLAT_CODE_W2RU: break; default: continue; } err = vhd_parent_locator_write_at(child, ppath, loc->data_offset, loc->code, max, loc); if (err) { VHDLOG("error writing parent locator %d for %s: %d\n", i, child->file, err); goto out; } } TEST_FAIL_AT(FAIL_REPARENT_LOCATOR); err = vhd_write_header(child, &child->header); if (err) { VHDLOG("error writing header for %s: %d\n", child->file, err); goto out; } err = 0; out: return err; } static int vhd_create_batmap(vhd_context_t *ctx) { off64_t off; int err, map_bytes; vhd_batmap_header_t *header; void *map; if (!vhd_type_dynamic(ctx)) return -EINVAL; map_bytes = (ctx->header.max_bat_size + 7) >> 3; header = &ctx->batmap.header; memset(header, 0, sizeof(vhd_batmap_header_t)); memcpy(header->cookie, VHD_BATMAP_COOKIE, sizeof(header->cookie)); err = vhd_batmap_header_offset(ctx, &off); if (err) return err; header->batmap_offset = off + vhd_bytes_padded(sizeof(vhd_batmap_header_t)); header->batmap_size = secs_round_up_no_zero(map_bytes); header->batmap_version = VHD_BATMAP_CURRENT_VERSION; map_bytes = vhd_sectors_to_bytes(header->batmap_size); err = posix_memalign(&map, VHD_SECTOR_SIZE, map_bytes); if (err) return -err; memset(map, 0, map_bytes); ctx->batmap.map = map; return vhd_write_batmap(ctx, &ctx->batmap); } static int vhd_create_bat(vhd_context_t *ctx) { int i, err; size_t size; void *bat; if (!vhd_type_dynamic(ctx)) return -EINVAL; size = vhd_bytes_padded(ctx->header.max_bat_size * sizeof(uint32_t)); err = posix_memalign(&bat, VHD_SECTOR_SIZE, size); if (err) return err; ctx->bat.bat = bat; memset(ctx->bat.bat, 0, size); for (i = 0; i < ctx->header.max_bat_size; i++) ctx->bat.bat[i] = DD_BLK_UNUSED; err = 
vhd_seek(ctx, ctx->header.table_offset, SEEK_SET); if (err) return err; ctx->bat.entries = ctx->header.max_bat_size; ctx->bat.spb = ctx->header.block_size >> VHD_SECTOR_SHIFT; return vhd_write_bat(ctx, &ctx->bat); } static int vhd_initialize_fixed_disk(vhd_context_t *ctx) { char *buf; int i, err; if (ctx->footer.type != HD_TYPE_FIXED) return -EINVAL; err = vhd_seek(ctx, 0, SEEK_SET); if (err) return err; buf = mmap(0, VHD_BLOCK_SIZE, PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (buf == MAP_FAILED) return -errno; for (i = 0; i < ctx->footer.curr_size >> VHD_BLOCK_SHIFT; i++) { err = vhd_write(ctx, buf, VHD_BLOCK_SIZE); if (err) goto out; } err = 0; out: munmap(buf, VHD_BLOCK_SIZE); return err; } int vhd_get_phys_size(vhd_context_t *ctx, off64_t *size) { int err; if ((err = vhd_end_of_data(ctx, size))) return err; *size += sizeof(vhd_footer_t); return 0; } int vhd_set_phys_size(vhd_context_t *ctx, off64_t size) { off64_t phys_size; int err; err = vhd_get_phys_size(ctx, &phys_size); if (err) return err; if (size < phys_size) { // would result in data loss VHDLOG("ERROR: new size (%"PRIu64") < phys size (%"PRIu64")\n", size, phys_size); return -EINVAL; } return vhd_write_footer_at(ctx, &ctx->footer, size - sizeof(vhd_footer_t)); } static int vhd_set_virt_size_no_write(vhd_context_t *ctx, uint64_t size) { if ((size >> VHD_BLOCK_SHIFT) > ctx->header.max_bat_size) { VHDLOG("not enough metadata space reserved for fast " "resize (BAT size %u, need %"PRIu64")\n", ctx->header.max_bat_size, size >> VHD_BLOCK_SHIFT); return -EINVAL; } /* update footer */ ctx->footer.curr_size = size; ctx->footer.geometry = vhd_chs(ctx->footer.curr_size); ctx->footer.checksum = vhd_checksum_footer(&ctx->footer); return 0; } int vhd_set_virt_size(vhd_context_t *ctx, uint64_t size) { int err; err = vhd_set_virt_size_no_write(ctx, size); if (err) return err; return vhd_write_footer(ctx, &ctx->footer); } static int __vhd_create(const char *name, const char *parent, uint64_t bytes, int type, 
uint64_t mbytes, vhd_flag_creat_t flags)
{
	int err;
	off64_t off;
	vhd_context_t ctx;
	uint64_t size, psize, blks;

	switch (type) {
	case HD_TYPE_DIFF:
		if (!parent)
			return -EINVAL;
		/* fallthrough */
	case HD_TYPE_FIXED:
	case HD_TYPE_DYNAMIC:
		break;
	default:
		return -EINVAL;
	}

	if (strnlen(name, VHD_MAX_NAME_LEN - 1) == VHD_MAX_NAME_LEN - 1)
		return -ENAMETOOLONG;

	/* the metadata size must cover at least the requested size */
	if (bytes && mbytes && mbytes < bytes)
		return -EINVAL;

	memset(&ctx, 0, sizeof(vhd_context_t));
	psize = 0;
	/* round the size up to a whole number of blocks */
	blks   = (bytes + VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT;
	/* If mbytes is provided (virtual-size-for-metadata-preallocation),
	 * create the VHD of size mbytes, which will create the BAT & the
	 * batmap of the appropriate size. Once the BAT & batmap are
	 * initialized, reset the virtual size to the requested one.
	 */
	if (mbytes)
		blks = (mbytes + VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT;
	size = blks << VHD_BLOCK_SHIFT;

	ctx.fd = open(name, O_WRONLY | O_CREAT |
		      O_TRUNC | O_LARGEFILE | O_DIRECT, 0644);
	if (ctx.fd == -1)
		return -errno;

	ctx.file = strdup(name);
	if (!ctx.file) {
		err = -ENOMEM;
		goto out;
	}

	err = vhd_test_file_fixed(ctx.file, &ctx.is_block);
	if (err)
		goto out;

	vhd_initialize_footer(&ctx, type, size);

	if (type == HD_TYPE_FIXED) {
		err = vhd_initialize_fixed_disk(&ctx);
		if (err)
			goto out;
	} else {
		int raw = vhd_flag_test(flags, VHD_FLAG_CREAT_PARENT_RAW);
		err = vhd_initialize_header(&ctx, parent, size, raw, &psize);
		if (err)
			goto out;

		err = vhd_create_batmap(&ctx);
		if (err)
			goto out;

		err = vhd_create_bat(&ctx);
		if (err)
			goto out;

		if (type == HD_TYPE_DIFF) {
			err = vhd_write_parent_locators(&ctx, parent);
			if (err)
				goto out;
		}
	}

	if (mbytes) {
		/* set the virtual size to the requested size */
		if (bytes) {
			blks = (bytes + VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT;
			size = blks << VHD_BLOCK_SHIFT;

		}
		else {
			/* no explicit size: fall back to the parent's size */
			size = psize;
		}
		ctx.footer.orig_size = size;
		err = vhd_set_virt_size_no_write(&ctx, size);
		if (err)
			goto out;
	}

	if (type != HD_TYPE_FIXED) {
		/* backup footer at offset 0, header in the next sector */
		err = vhd_write_footer_at(&ctx, &ctx.footer, 0);
		if (err)
			goto out;

		err = vhd_write_header_at(&ctx,
&ctx.header, VHD_SECTOR_SIZE); if (err) goto out; } err = vhd_seek(&ctx, 0, SEEK_END); if (err) goto out; off = vhd_position(&ctx); if (off == (off64_t)-1) { err = -errno; goto out; } if (ctx.is_block) off -= sizeof(vhd_footer_t); err = vhd_write_footer_at(&ctx, &ctx.footer, off); if (err) goto out; err = 0; out: vhd_close(&ctx); if (err && !ctx.is_block) unlink(name); return err; } int vhd_create(const char *name, uint64_t bytes, int type, uint64_t mbytes, vhd_flag_creat_t flags) { return __vhd_create(name, NULL, bytes, type, mbytes, flags); } int vhd_snapshot(const char *name, uint64_t bytes, const char *parent, uint64_t mbytes, vhd_flag_creat_t flags) { return __vhd_create(name, parent, bytes, HD_TYPE_DIFF, mbytes, flags); } static int __vhd_io_fixed_read(vhd_context_t *ctx, char *buf, uint64_t sec, uint32_t secs) { int err; err = vhd_seek(ctx, vhd_sectors_to_bytes(sec), SEEK_SET); if (err) return err; return vhd_read(ctx, buf, vhd_sectors_to_bytes(secs)); } static void __vhd_io_dynamic_copy_data(vhd_context_t *ctx, char *map, int map_off, char *bitmap, int bitmap_off, char *dst, char *src, int secs) { int i; for (i = 0; i < secs; i++) { if (test_bit(map, map_off + i)) goto next; if (ctx && !vhd_bitmap_test(ctx, bitmap, bitmap_off + i)) goto next; memcpy(dst, src, VHD_SECTOR_SIZE); set_bit(map, map_off + i); next: src += VHD_SECTOR_SIZE; dst += VHD_SECTOR_SIZE; } } static int __vhd_io_dynamic_read_link(vhd_context_t *ctx, char *map, char *buf, uint64_t sector, uint32_t secs) { off64_t off; uint32_t blk, sec; int err, cnt, map_off; char *bitmap, *data, *src; map_off = 0; do { blk = sector / ctx->spb; sec = sector % ctx->spb; off = ctx->bat.bat[blk]; data = NULL; bitmap = NULL; if (off == DD_BLK_UNUSED) { cnt = MIN(secs, ctx->spb); goto next; } err = vhd_read_bitmap(ctx, blk, &bitmap); if (err) return err; err = vhd_read_block(ctx, blk, &data); if (err) { free(bitmap); return err; } cnt = MIN(secs, ctx->spb - sec); src = data + vhd_sectors_to_bytes(sec); 
__vhd_io_dynamic_copy_data(ctx, map, map_off, bitmap, sec, buf, src, cnt); next: free(data); free(bitmap); secs -= cnt; sector += cnt; map_off += cnt; buf += vhd_sectors_to_bytes(cnt); } while (secs); return 0; } static int __raw_read_link(char *filename, char *map, char *buf, uint64_t sec, uint32_t secs) { int fd, err; off64_t off; uint64_t size; void *data; err = 0; errno = 0; fd = open(filename, O_RDONLY | O_DIRECT | O_LARGEFILE); if (fd == -1) { VHDLOG("%s: failed to open: %d\n", filename, -errno); return -errno; } off = lseek64(fd, vhd_sectors_to_bytes(sec), SEEK_SET); if (off == (off64_t)-1) { VHDLOG("%s: seek(0x%08"PRIx64") failed: %d\n", filename, vhd_sectors_to_bytes(sec), -errno); err = -errno; goto close; } size = vhd_sectors_to_bytes(secs); err = posix_memalign(&data, VHD_SECTOR_SIZE, size); if (err) goto close; err = read(fd, data, size); if (err != size) { VHDLOG("%s: reading of %"PRIu64" returned %d, errno: %d\n", filename, size, err, -errno); free(data); err = errno ? -errno : -EIO; goto close; } __vhd_io_dynamic_copy_data(NULL, map, 0, NULL, 0, buf, data, secs); free(data); err = 0; close: close(fd); return err; } static int __vhd_io_dynamic_read(vhd_context_t *ctx, char *buf, uint64_t sec, uint32_t secs) { int err; uint32_t i, done; char *map, *next; vhd_context_t parent, *vhd; err = vhd_get_bat(ctx); if (err) return err; vhd = ctx; next = NULL; map = calloc(1, secs << (VHD_SECTOR_SHIFT - 3)); if (!map) return -ENOMEM; memset(buf, 0, vhd_sectors_to_bytes(secs)); for (;;) { err = __vhd_io_dynamic_read_link(vhd, map, buf, sec, secs); if (err) goto close; for (done = 0, i = 0; i < secs; i++) if (test_bit(map, i)) done++; if (done == secs) { err = 0; goto close; } if (vhd->footer.type == HD_TYPE_DIFF) { vhd_context_t *p; p = vhd_cache_get_parent(vhd); if (p) { vhd = p; err = vhd_get_bat(vhd); if (err) goto out; continue; } err = vhd_parent_locator_get(vhd, &next); if (err) goto close; if (vhd_parent_raw(vhd)) { err = __raw_read_link(next, map, buf, 
sec, secs);
				goto close;
			}
		} else {
			/* end of the chain: missing sectors stay zero */
			err = 0;
			goto close;
		}

		/* move on to the parent, closing the link just processed */
		if (vhd != ctx)
			vhd_close(vhd);
		vhd = &parent;

		err = vhd_open(vhd, next, VHD_OPEN_RDONLY);
		if (err)
			goto out;

		err = vhd_get_bat(vhd);
		if (err)
			goto close;

		free(next);
		next = NULL;
	}

close:
	/* cached parents are not closed here */
	if (vhd != ctx && !vhd_flag_test(vhd->oflags, VHD_OPEN_CACHED))
		vhd_close(vhd);
out:
	free(map);
	free(next);
	return err;
}

/*
 * Read @secs sectors starting at virtual sector @sec into @buf.
 * Returns -ERANGE if the request extends past the virtual size.
 */
int
vhd_io_read(vhd_context_t *ctx, char *buf, uint64_t sec, uint32_t secs)
{
	if (vhd_sectors_to_bytes(sec + secs) > ctx->footer.curr_size)
		return -ERANGE;

	if (!vhd_type_dynamic(ctx))
		return __vhd_io_fixed_read(ctx, buf, sec, secs);

	return __vhd_io_dynamic_read(ctx, buf, sec, secs);
}

/* Write @secs sectors starting at sector @sec to a fixed image. */
static int
__vhd_io_fixed_write(vhd_context_t *ctx,
		     char *buf, uint64_t sec, uint32_t secs)
{
	int err;

	err = vhd_seek(ctx, vhd_sectors_to_bytes(sec), SEEK_SET);
	if (err)
		return err;

	return vhd_write(ctx, buf, vhd_sectors_to_bytes(secs));
}

/*
 * Allocate BAT entry @block at the current end of data: write zeroes
 * for the bitmap (plus any alignment gap, plus the data area unless
 * sparse writes were requested), then record the new block in the BAT
 * and write the BAT to disk.
 */
static int
__vhd_io_allocate_block(vhd_context_t *ctx, uint32_t block)
{
	char *buf;
	size_t size;
	off64_t off, max;
	int err, gap, spp, secs;

	/* sectors per page */
	spp = getpagesize() >> VHD_SECTOR_SHIFT;

	err = vhd_end_of_data(ctx, &max);
	if (err)
		return err;

	gap   = 0;
	off   = max;
	max >>= VHD_SECTOR_SHIFT;

	/* data region of segment should begin on page boundary */
	if ((max + ctx->bm_secs) % spp) {
		gap  = (spp - ((max + ctx->bm_secs) % spp));
		max += gap;
	}

	err = vhd_seek(ctx, off, SEEK_SET);
	if (err)
		return err;

	secs = ctx->bm_secs + gap;
	if (!vhd_flag_test(ctx->oflags, VHD_OPEN_IO_WRITE_SPARSE))
		secs += ctx->spb;

	/* a read-only anonymous mapping serves as the source of zeroes */
	size = vhd_sectors_to_bytes(secs);
	buf  = mmap(0, size, PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return -errno;

	err = vhd_write(ctx, buf, size);
	if (err)
		goto out;

	/* max is the sector where the block's bitmap starts (after the gap) */
	ctx->bat.bat[block] = max;
	err = vhd_write_bat(ctx, &ctx->bat);
	if (err)
		goto out;

	err = 0;

out:
	munmap(buf, size);
	return err;
}

/*
 * Write @secs sectors starting at virtual sector @sector to a dynamic
 * image, allocating blocks as needed and keeping the per-block bitmaps
 * and the batmap up to date.
 */
static int
__vhd_io_dynamic_write(vhd_context_t *ctx,
		       char *buf, uint64_t sector, uint32_t secs)
{
	char *map;
	off64_t off;
	uint32_t blk, sec;
	int i, err, cnt, ret;

	if
(vhd_sectors_to_bytes(sector + secs) > ctx->footer.curr_size) return -ERANGE; err = vhd_get_bat(ctx); if (err) return err; if (vhd_has_batmap(ctx)) { err = vhd_get_batmap(ctx); if (err) return err; } do { blk = sector / ctx->spb; sec = sector % ctx->spb; off = ctx->bat.bat[blk]; if (off == DD_BLK_UNUSED) { err = __vhd_io_allocate_block(ctx, blk); if (err) return err; off = ctx->bat.bat[blk]; } off += ctx->bm_secs + sec; err = vhd_seek(ctx, vhd_sectors_to_bytes(off), SEEK_SET); if (err) return err; cnt = MIN(secs, ctx->spb - sec); err = vhd_write(ctx, buf, vhd_sectors_to_bytes(cnt)); if (err) return err; if (vhd_has_batmap(ctx) && vhd_batmap_test(ctx, &ctx->batmap, blk)) goto next; err = vhd_read_bitmap(ctx, blk, &map); if (err) return err; for (i = 0; i < cnt; i++) vhd_bitmap_set(ctx, map, sec + i); err = vhd_write_bitmap(ctx, blk, map); if (err) goto fail; if (vhd_has_batmap(ctx)) { for (i = 0; i < ctx->spb; i++) if (!vhd_bitmap_test(ctx, map, i)) { free(map); goto next; } vhd_batmap_set(ctx, &ctx->batmap, blk); err = vhd_write_batmap(ctx, &ctx->batmap); if (err) goto fail; } free(map); map = NULL; next: secs -= cnt; sector += cnt; buf += vhd_sectors_to_bytes(cnt); } while (secs); err = 0; out: ret = vhd_write_footer(ctx, &ctx->footer); return (err ? 
err : ret); fail: free(map); goto out; } int vhd_io_write(vhd_context_t *ctx, char *buf, uint64_t sec, uint32_t secs) { if (vhd_sectors_to_bytes(sec + secs) > ctx->footer.curr_size) return -ERANGE; if (!vhd_type_dynamic(ctx)) return __vhd_io_fixed_write(ctx, buf, sec, secs); return __vhd_io_dynamic_write(ctx, buf, sec, secs); } static void vhd_cache_init(vhd_context_t *ctx) { INIT_LIST_HEAD(&ctx->next); } static int vhd_cache_enabled(vhd_context_t *ctx) { return vhd_flag_test(ctx->oflags, VHD_OPEN_CACHED); } static int vhd_cache_load(vhd_context_t *ctx) { char *next; int err, pflags; vhd_context_t *vhd; err = 1; pflags = ctx->oflags; vhd = ctx; next = NULL; vhd_flag_set(pflags, VHD_OPEN_RDONLY); vhd_flag_clear(pflags, VHD_OPEN_CACHED); if (!vhd_cache_enabled(vhd)) goto done; while (vhd->footer.type == HD_TYPE_DIFF) { vhd_context_t *parent; parent = NULL; if (vhd_parent_raw(vhd)) goto done; err = vhd_parent_locator_get(vhd, &next); if (err) goto out; parent = calloc(1, sizeof(*parent)); if (!parent) goto out; err = vhd_open(parent, next, pflags); if (err) { free(parent); parent = NULL; goto out; } fcntl(parent->fd, F_SETFL, fcntl(parent->fd, F_GETFL) & ~O_DIRECT); vhd_flag_set(parent->oflags, VHD_OPEN_CACHED); list_add(&parent->next, &vhd->next); free(next); next = NULL; vhd = parent; } done: err = 0; out: free(next); if (err) vhd_cache_unload(vhd); return err; } static int vhd_cache_unload(vhd_context_t *ctx) { vhd_context_t *vhd, *tmp; if (!vhd_cache_enabled(ctx)) goto out; list_for_each_entry_safe(vhd, tmp, &ctx->next, next) { list_del_init(&vhd->next); vhd_close(vhd); free(vhd); } INIT_LIST_HEAD(&ctx->next); out: return 0; } static vhd_context_t * vhd_cache_get_parent(vhd_context_t *ctx) { vhd_context_t *vhd; vhd = NULL; if (!vhd_cache_enabled(ctx)) goto out; if (list_empty(&ctx->next)) goto out; vhd = list_entry(ctx->next.next, vhd_context_t, next); out: return vhd; } typedef struct vhd_block_vector vhd_block_vector_t; typedef struct vhd_block_vector_entry 
vhd_block_vector_entry_t; struct vhd_block_vector_entry { uint64_t off; /* byte offset from block */ uint32_t bytes; /* size in bytes */ char *buf; /* destination buffer */ }; struct vhd_block_vector { uint32_t block; /* logical block in vhd */ int entries; /* number of vector entries */ vhd_block_vector_entry_t *array; /* vector list */ }; /** * @vec: block vector describing read * * @vec describes a list of byte-spans within a given block * and a corresponding list of destination buffers. */ static int vhd_block_vector_read(vhd_context_t *ctx, vhd_block_vector_t *vec) { int err, i; off64_t off; uint32_t blk; err = vhd_get_bat(ctx); if (err) goto out; if (vec->block >= ctx->bat.entries) { err = -ERANGE; goto out; } blk = ctx->bat.bat[vec->block]; if (blk == DD_BLK_UNUSED) { err = -EINVAL; goto out; } off = vhd_sectors_to_bytes(blk + ctx->bm_secs); for (i = 0; i < vec->entries; i++) { vhd_block_vector_entry_t *v = vec->array + i; err = vhd_pread(ctx, v->buf, v->bytes, off + v->off); if (err) goto out; } out: return err; } /** * @vec: block vector to initialize * @block: vhd block number * @map: optional bitmap of sectors to map (relative to beginning of block) * @buf: destination buffer * @blk_start: byte offset relative to beginning of block * @blk_end: byte offset relative to beginning of block * * initializes @vec to describe a read into a contiguous buffer * of potentially non-contiguous byte ranges in a given vhd block. * only sectors with corresponding bits set in @map (if it is not NULL) * will be mapped; bits corresponding to unmapped sectors will be cleared. * first and last sector maps may be smaller than vhd sector size. 
*/ static int vhd_block_vector_init(vhd_context_t *ctx, vhd_block_vector_t *vec, uint32_t block, char *map, char *buf, uint64_t blk_start, uint64_t blk_end) { int err, sec; char *bitmap; uint32_t first_sec, last_sec; bitmap = NULL; memset(vec, 0, sizeof(*vec)); first_sec = blk_start >> VHD_SECTOR_SHIFT; last_sec = secs_round_up_no_zero(blk_end); err = vhd_read_bitmap(ctx, block, &bitmap); if (err) goto out; vec->array = calloc(ctx->spb, sizeof(vhd_block_vector_entry_t)); if (!vec->array) { err = -ENOMEM; goto out; } for (sec = first_sec; sec < last_sec; sec++) { uint32_t cnt; vhd_block_vector_entry_t *v; cnt = VHD_SECTOR_SIZE - (blk_start & (VHD_SECTOR_SIZE - 1)); if (cnt > blk_end - blk_start) cnt = blk_end - blk_start; if (map && !test_bit(map, sec)) goto next; if (vhd_bitmap_test(ctx, bitmap, sec)) { if (vec->entries > 0) { v = vec->array + vec->entries - 1; if (v->off + v->bytes == blk_start) { v->bytes += cnt; goto next; } } v = vec->array + vec->entries; v->off = blk_start; v->bytes = cnt; v->buf = buf; vec->entries++; } else if (map) { clear_bit(map, sec); } next: blk_start += cnt; buf += cnt; } vec->block = block; out: free(bitmap); return err; } #if 0 /** * @block: vhd block number * @buf: buffer to place data in * @size: number of bytes to read * @start: byte offset into block from which to start reading * @end: byte offset in block at which to stop reading * * reads data (if it exists) into @buf. partial reads may occur * for the first and last sectors if @start and @end are not multiples * of vhd sector size. 
*/ static int vhd_block_vector_read_allocated(vhd_context_t *ctx, uint32_t block, char *buf, uint64_t start, uint64_t end) { int err; vhd_block_vector_t vec; vec.array = NULL; err = vhd_block_vector_init(ctx, &vec, block, NULL, buf, start, end); if (err) goto out; err = vhd_block_vector_read(ctx, &vec); out: free(vec.array); return err; } #endif /** * @block: vhd block number * @map: bitmap of sectors in block which should be read * @buf: buffer to place data in * @start: byte offset into block from which to start reading * @end: byte offset in block at which to stop reading * * for every bit set in @map (corresponding to sectors in @block), * reads data (if it exists) into @buf. if data does not exist, * clears corresponding bit in @map. partial reads may occur * for the first and last sectors if @start and @end are not multiples * of vhd sector size. */ static int vhd_block_vector_read_allocated_selective(vhd_context_t *ctx, uint32_t block, char *map, char *buf, uint64_t start, uint64_t end) { int err; vhd_block_vector_t vec; vec.array = NULL; err = vhd_block_vector_init(ctx, &vec, block, map, buf, start, end); if (err) goto out; err = vhd_block_vector_read(ctx, &vec); out: free(vec.array); return err; } /** * @map: bitmap of sectors which have already been read * @buf: destination buffer * @size: size in bytes to read * @off: byte offset in virtual disk to read * * reads @size bytes into @buf, starting at @off, skipping sectors * which have corresponding bits set in @map */ static int __vhd_io_dynamic_read_link_bytes(vhd_context_t *ctx, char *map, char *buf, size_t size, uint64_t off) { char *blkmap; int i, err, map_off; off64_t blk_off, blk_size; uint32_t blk, bytes, first_sec, last_sec; blkmap = malloc((ctx->spb + 7) >> 3); if (!blkmap) { err = -ENOMEM; goto out; } map_off = 0; blk_size = vhd_sectors_to_bytes(ctx->spb); do { blk = off / blk_size; blk_off = off % blk_size; bytes = MIN(blk_size - blk_off, size); first_sec = blk_off >> VHD_SECTOR_SHIFT; last_sec 
= secs_round_up_no_zero(blk_off + bytes); if (ctx->bat.bat[blk] == DD_BLK_UNUSED) goto next; memset(blkmap, 0, (ctx->spb + 7) >> 3); for (i = 0; i < (last_sec - first_sec); i++) if (!test_bit(map, map_off + i)) set_bit(blkmap, first_sec + i); err = vhd_block_vector_read_allocated_selective(ctx, blk, blkmap, buf, blk_off, blk_off + bytes); if (err) goto out; for (i = 0; i < (last_sec - first_sec); i++) if (test_bit(blkmap, first_sec + i)) set_bit(map, map_off + i); next: size -= bytes; off += bytes; map_off += (last_sec - first_sec); buf += bytes; } while (size); err = 0; out: free(blkmap); return err; } static int __raw_read_link_bytes(const char *filename, char *map, char *buf, size_t size, uint64_t off) { int fd, err; uint32_t i, first_sec, last_sec; fd = open(filename, O_RDONLY | O_LARGEFILE); if (fd == -1) { VHDLOG("%s: failed to open: %d\n", filename, -errno); return -errno; } first_sec = off >> VHD_SECTOR_SHIFT; last_sec = secs_round_up_no_zero(off + size); for (i = first_sec; i < last_sec; i++) { if (!test_bit(map, i - first_sec)) { uint32_t secs = 0; uint64_t coff, csize; while (i + secs < last_sec && !test_bit(map, i + secs - first_sec)) secs++; coff = vhd_sectors_to_bytes(i); csize = vhd_sectors_to_bytes(secs); if (i == first_sec) coff = off; if (secs == last_sec - 1) csize = (off + size) - coff; if (pread(fd, buf + coff - off, csize, coff) != csize) { err = (errno ? 
-errno : -EIO); goto close; } i += secs - 1; } } err = 0; close: close(fd); return err; } static int __vhd_io_dynamic_read_bytes(vhd_context_t *ctx, char *buf, size_t size, uint64_t off) { int err; char *next, *map; vhd_context_t parent, *vhd; uint32_t i, done, first_sec, last_sec; err = vhd_get_bat(ctx); if (err) return err; first_sec = off >> VHD_SECTOR_SHIFT; last_sec = secs_round_up_no_zero(off + size); vhd = ctx; next = NULL; map = calloc(1, ((last_sec - first_sec) + 7) >> 3); if (!map) { err = -ENOMEM; goto out; } for (;;) { err = __vhd_io_dynamic_read_link_bytes(vhd, map, buf, size, off); if (err) goto close; for (done = 0, i = 0; i < (last_sec - first_sec); i++) if (test_bit(map, i)) done++; if (done == last_sec - first_sec) { err = 0; goto close; } if (vhd->footer.type == HD_TYPE_DIFF) { vhd_context_t *p; p = vhd_cache_get_parent(vhd); if (p) { vhd = p; err = vhd_get_bat(vhd); if (err) goto out; continue; } err = vhd_parent_locator_get(vhd, &next); if (err) goto close; if (vhd_parent_raw(vhd)) { err = __raw_read_link_bytes(next, map, buf, size, off); goto close; } } else { err = 0; goto close; } if (vhd != ctx) vhd_close(vhd); vhd = &parent; err = vhd_open(vhd, next, VHD_OPEN_RDONLY); if (err) goto out; err = vhd_get_bat(vhd); if (err) goto close; free(next); next = NULL; } close: if (!err) { /* * clear any regions not present on disk */ for (i = first_sec; i < last_sec; i++) { if (!test_bit(map, i - first_sec)) { uint64_t coff = vhd_sectors_to_bytes(i); uint32_t csize = VHD_SECTOR_SIZE; if (i == first_sec) coff = off; if (i == last_sec - 1) csize = (off + size) - coff; memset(buf + coff - off, 0, csize); } } } if (vhd != ctx && !vhd_flag_test(vhd->oflags, VHD_OPEN_CACHED)) vhd_close(vhd); out: free(map); free(next); return err; } int vhd_io_read_bytes(vhd_context_t *ctx, void *buf, size_t size, uint64_t off) { if (off + size > ctx->footer.curr_size) return -ERANGE; if (!vhd_type_dynamic(ctx)) return vhd_pread(ctx, buf, size, off); return 
__vhd_io_dynamic_read_bytes(ctx, buf, size, off); } static int __vhd_io_dynamic_write_bytes_aligned(vhd_context_t *ctx, char *buf, size_t size, uint64_t off) { char *map; int i, err, ret; uint64_t blk_off, blk_size, blk_start; uint32_t blk, bytes, first_sec, last_sec; if (off & (VHD_SECTOR_SIZE - 1) || size & (VHD_SECTOR_SIZE - 1)) return -EINVAL; err = vhd_get_bat(ctx); if (err) return err; if (vhd_has_batmap(ctx)) { err = vhd_get_batmap(ctx); if (err) return err; } map = NULL; blk_size = vhd_sectors_to_bytes(ctx->spb); do { blk = off / blk_size; blk_off = off % blk_size; bytes = MIN(blk_size - blk_off, size); first_sec = blk_off >> VHD_SECTOR_SHIFT; last_sec = secs_round_up_no_zero(blk_off + bytes); blk_start = ctx->bat.bat[blk]; if (blk_start == DD_BLK_UNUSED) { err = __vhd_io_allocate_block(ctx, blk); if (err) goto fail; blk_start = ctx->bat.bat[blk]; } blk_start = vhd_sectors_to_bytes(blk_start + ctx->bm_secs); err = vhd_pwrite(ctx, buf, bytes, blk_start + blk_off); if (err) goto fail; if (vhd_has_batmap(ctx) && vhd_batmap_test(ctx, &ctx->batmap, blk)) goto next; err = vhd_read_bitmap(ctx, blk, &map); if (err) { map = NULL; goto fail; } for (i = first_sec; i < last_sec; i++) vhd_bitmap_set(ctx, map, i); err = vhd_write_bitmap(ctx, blk, map); if (err) goto fail; if (vhd_has_batmap(ctx)) { for (i = 0; i < ctx->spb; i++) if (!vhd_bitmap_test(ctx, map, i)) { free(map); map = NULL; goto next; } vhd_batmap_set(ctx, &ctx->batmap, blk); err = vhd_write_batmap(ctx, &ctx->batmap); if (err) goto fail; } free(map); map = NULL; next: size -= bytes; off += bytes; buf += bytes; } while (size); err = 0; out: ret = vhd_write_footer(ctx, &ctx->footer); return (err ? 
err : ret); fail: free(map); goto out; } static int __vhd_io_dynamic_write_bytes(vhd_context_t *ctx, char *buf, size_t size, uint64_t off) { int err; char *tmp; uint32_t first_sec, last_sec, first_sec_off, last_sec_off; err = 0; tmp = NULL; first_sec = off >> VHD_SECTOR_SHIFT; last_sec = secs_round_up_no_zero(off + size); first_sec_off = off & (VHD_SECTOR_SIZE - 1); last_sec_off = (off + size) & (VHD_SECTOR_SIZE - 1); if (first_sec_off || last_sec_off) { tmp = malloc(VHD_SECTOR_SIZE); if (!tmp) { err = -ENOMEM; goto out; } if (first_sec_off) { uint32_t new = VHD_SECTOR_SIZE - first_sec_off; if (new > size) new = size; err = vhd_io_read_bytes( ctx, tmp, VHD_SECTOR_SIZE, vhd_sectors_to_bytes(first_sec)); if (err) goto out; memcpy(tmp + first_sec_off, buf, new); err = __vhd_io_dynamic_write_bytes_aligned( ctx, tmp, VHD_SECTOR_SIZE, vhd_sectors_to_bytes(first_sec)); if (err) goto out; buf += new; off += new; size -= new; } if (last_sec_off && (last_sec - first_sec > 1 || !first_sec_off)) { uint32_t new = last_sec_off; err = vhd_io_read_bytes( ctx, tmp, VHD_SECTOR_SIZE, vhd_sectors_to_bytes(last_sec - 1)); if (err) goto out; memcpy(tmp, buf + size - new, new); err = __vhd_io_dynamic_write_bytes_aligned( ctx, tmp, VHD_SECTOR_SIZE, vhd_sectors_to_bytes(last_sec - 1)); if (err) goto out; size -= new; } } if (size) err = __vhd_io_dynamic_write_bytes_aligned(ctx, buf, size, off); out: free(tmp); return err; } int vhd_io_write_bytes(vhd_context_t *ctx, void *buf, size_t size, uint64_t off) { if (off + size > ctx->footer.curr_size) return -ERANGE; if (!vhd_type_dynamic(ctx)) return vhd_pwrite(ctx, buf, size, off); return __vhd_io_dynamic_write_bytes(ctx, buf, size, off); } int vhd_marker(vhd_context_t *ctx, char *marker) { int err; vhd_batmap_t batmap; *marker = 0; if (!vhd_has_batmap(ctx)) return -ENOSYS; err = vhd_read_batmap_header(ctx, &batmap); if (err) return err; *marker = batmap.header.marker; return 0; } int vhd_set_marker(vhd_context_t *ctx, char marker) { int err; 
vhd_batmap_t batmap; if (!vhd_has_batmap(ctx)) return -ENOSYS; err = vhd_read_batmap_header(ctx, &batmap); if (err) return err; batmap.header.marker = marker; return vhd_write_batmap_header(ctx, &batmap); } blktap-2.0.90/vhd/lib/vhd-util-query.c0000644000000000000000000001063111664745551016166 0ustar rootroot/* * Copyright (c) 2007, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include "libvhd.h" int vhd_util_query(int argc, char **argv) { char *name; vhd_context_t vhd; off64_t currsize; int ret, err, c, size, physize, parent, fields, depth, fastresize, marker; name = NULL; size = 0; physize = 0; parent = 0; fields = 0; depth = 0; fastresize = 0; marker = 0; if (!argc || !argv) { err = -EINVAL; goto usage; } optind = 0; while ((c = getopt(argc, argv, "n:vspfdSmh")) != -1) { switch (c) { case 'n': name = optarg; break; case 'v': size = 1; break; case 's': physize = 1; break; case 'p': parent = 1; break; case 'f': fields = 1; break; case 'd': depth = 1; break; case 'S': fastresize = 1; break; case 'm': marker = 1; break; case 'h': err = 0; goto usage; default: err = -EINVAL; goto usage; } } if (!name || optind != argc) { err = -EINVAL; goto usage; } err = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED); if (err) { printf("error opening %s: %d\n", name, err); return err; } if (size) printf("%"PRIu64"\n", vhd.footer.curr_size >> 20); if (physize) { err = vhd_get_phys_size(&vhd, &currsize); if (err) printf("failed to get physical size: %d\n", err); else printf("%"PRIu64"\n", currsize); } if (parent) { ret = 0; if (vhd.footer.type != HD_TYPE_DIFF) printf("%s has no parent\n", name); else { char *pname; ret = vhd_parent_locator_get(&vhd, &pname); if (ret) printf("query failed\n"); else { printf("%s\n", pname); free(pname); } } err = (err ? : ret); } if (fields) { int hidden; ret = vhd_hidden(&vhd, &hidden); if (ret) printf("error checking 'hidden' field: %d\n", ret); else printf("hidden: %d\n", hidden); err = (err ? : ret); } if (marker) { char marker; ret = vhd_marker(&vhd, &marker); if (ret) printf("error checking 'marker' field: %d\n", ret); else printf("marker: %d\n", marker); err = (err ? 
: ret); } if (depth) { int length; ret = vhd_chain_depth(&vhd, &length); if (ret) printf("error checking chain depth: %d\n", ret); else printf("chain depth: %d\n", length); err = (err ? : ret); } if (fastresize) { uint64_t max_size; max_size = vhd.header.max_bat_size << (VHD_BLOCK_SHIFT - 20); printf("%"PRIu64"\n", max_size); } vhd_close(&vhd); return err; usage: printf("options: <-n name> [-v print virtual size (in MB)] " "[-s print physical utilization (bytes)] [-p print parent] " "[-f print fields] [-m print marker] [-d print chain depth] " "[-S print max virtual size (MB) for fast resize] [-h help]\n"); return err; } blktap-2.0.90/vhd/lib/vhd-util-resize.c0000644000000000000000000006232011664745551016324 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include #include #include "libvhd-journal.h" #if 1 #define DFPRINTF(_f, _a...) fprintf(stdout, _f, ##_a) #else #define DFPRINTF(_f, _a...) ((void)0) #endif #define EPRINTF(_f, _a...) \ do { \ syslog(LOG_INFO, "%s: " _f, __func__, ##_a); \ DFPRINTF(_f, _a); \ } while (0) typedef struct vhd_block { uint32_t block; uint32_t offset; } vhd_block_t; TEST_FAIL_EXTERN_VARS; static inline uint32_t secs_to_blocks_down(vhd_context_t *vhd, uint64_t secs) { return secs / vhd->spb; } static uint32_t secs_to_blocks_up(vhd_context_t *vhd, uint64_t secs) { uint32_t blocks; blocks = secs / vhd->spb; if (secs % vhd->spb) blocks++; return blocks; } static int vhd_fixed_shrink(vhd_journal_t *journal, uint64_t secs) { int err; uint64_t new_eof; vhd_context_t *vhd; vhd = &journal->vhd; new_eof = vhd->footer.curr_size - vhd_sectors_to_bytes(secs); if (new_eof <= sizeof(vhd_footer_t)) return -EINVAL; err = ftruncate(vhd->fd, new_eof); if (err) return errno; vhd->footer.curr_size = new_eof; return vhd_write_footer(vhd, &vhd->footer); } static int vhd_write_zeros(vhd_journal_t *journal, off64_t off, uint64_t size) { int err; char *buf; vhd_context_t *vhd; uint64_t bytes, map; vhd = &journal->vhd; map = MIN(size, VHD_BLOCK_SIZE); err = vhd_seek(vhd, off, SEEK_SET); if (err) return err; buf = mmap(0, map, PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (buf == 
MAP_FAILED) return -errno; do { bytes = MIN(size, map); err = vhd_write(vhd, buf, bytes); if (err) break; size -= bytes; } while (size); munmap(buf, map); return err; } static int vhd_fixed_grow(vhd_journal_t *journal, uint64_t secs) { int err; vhd_context_t *vhd; uint64_t size, eof, new_eof; size = vhd_sectors_to_bytes(secs); vhd = &journal->vhd; err = vhd_seek(vhd, 0, SEEK_END); if (err) goto out; eof = vhd_position(vhd); if (eof == (off64_t)-1) { err = -errno; goto out; } err = vhd_write_zeros(journal, eof - sizeof(vhd_footer_t), size); if (err) goto out; new_eof = eof + size; err = vhd_seek(vhd, new_eof, SEEK_SET); if (err) goto out; vhd->footer.curr_size += size; err = vhd_write_footer(vhd, &vhd->footer); if (err) goto out; err = 0; out: return err; } static int vhd_fixed_resize(vhd_journal_t *journal, uint64_t size) { int err; vhd_context_t *vhd; uint64_t cur_secs, new_secs; vhd = &journal->vhd; cur_secs = vhd->footer.curr_size >> VHD_SECTOR_SHIFT; new_secs = size << (20 - VHD_SECTOR_SHIFT); if (cur_secs == new_secs) return 0; else if (cur_secs > new_secs) err = vhd_fixed_shrink(journal, cur_secs - new_secs); else err = vhd_fixed_grow(journal, new_secs - cur_secs); return err; } static inline void swap(vhd_block_t *list, int a, int b) { vhd_block_t tmp; tmp = list[a]; list[a] = list[b]; list[b] = tmp; } static int partition(vhd_block_t *list, int left, int right, int pidx) { int i, sidx; long long pval; sidx = left; pval = list[pidx].offset; swap(list, pidx, right); for (i = left; i < right; i++) if (list[i].offset >= pval) { swap(list, sidx, i); ++sidx; } swap(list, right, sidx); return sidx; } static void quicksort(vhd_block_t *list, int left, int right) { int pidx, new_pidx; if (right < left) return; pidx = left; new_pidx = partition(list, left, right, pidx); quicksort(list, left, new_pidx - 1); quicksort(list, new_pidx + 1, right); } static int vhd_move_block(vhd_journal_t *journal, uint32_t src, off64_t offset) { int err; char *buf; size_t size; 
vhd_context_t *vhd; off64_t off, src_off; buf = NULL; vhd = &journal->vhd; off = offset; size = vhd_sectors_to_bytes(vhd->bm_secs); src_off = vhd->bat.bat[src]; if (src_off == DD_BLK_UNUSED) return -EINVAL; src_off = vhd_sectors_to_bytes(src_off); err = vhd_journal_add_block(journal, src, VHD_JOURNAL_DATA | VHD_JOURNAL_METADATA); if (err) goto out; err = vhd_read_bitmap(vhd, src, &buf); if (err) goto out; err = vhd_seek(vhd, off, SEEK_SET); if (err) goto out; err = vhd_write(vhd, buf, size); if (err) goto out; free(buf); buf = NULL; off += size; size = vhd_sectors_to_bytes(vhd->spb); err = vhd_read_block(vhd, src, &buf); if (err) goto out; err = vhd_seek(vhd, off, SEEK_SET); if (err) goto out; err = vhd_write(vhd, buf, size); if (err) goto out; vhd->bat.bat[src] = offset >> VHD_SECTOR_SHIFT; err = vhd_write_zeros(journal, src_off, vhd_sectors_to_bytes(vhd->bm_secs + vhd->spb)); out: free(buf); return err; } static int vhd_clobber_block(vhd_journal_t *journal, uint32_t src, uint32_t dest) { int err; off64_t off; vhd_context_t *vhd; vhd = &journal->vhd; off = vhd_sectors_to_bytes(vhd->bat.bat[dest]); err = vhd_journal_add_block(journal, dest, VHD_JOURNAL_DATA | VHD_JOURNAL_METADATA); if (err) return err; err = vhd_move_block(journal, src, off); if (err) return err; vhd->bat.bat[dest] = DD_BLK_UNUSED; return 0; } /* * remove a list of blocks from the vhd file * if a block to be removed: * - resides at the end of the file: simply clear its bat entry * - resides elsewhere: move the last block in the file into its position * and update the bat to reflect this */ static int vhd_defrag_shrink(vhd_journal_t *journal, vhd_block_t *original_free_list, int free_cnt) { vhd_context_t *vhd; int i, j, free_idx, err; vhd_block_t *blocks, *free_list; err = 0; blocks = NULL; free_list = NULL; vhd = &journal->vhd; blocks = malloc(vhd->bat.entries * sizeof(vhd_block_t)); if (!blocks) { err = -ENOMEM; goto out; } free_list = malloc(free_cnt * sizeof(vhd_block_t)); if (!free_list) { err 
= -ENOMEM; goto out; } for (i = 0; i < vhd->bat.entries; i++) { blocks[i].block = i; blocks[i].offset = vhd->bat.bat[i]; } memcpy(free_list, original_free_list, free_cnt * sizeof(vhd_block_t)); /* sort both the to-free list and the bat list * in order of descending file offset */ quicksort(free_list, 0, free_cnt - 1); quicksort(blocks, 0, vhd->bat.entries - 1); for (i = 0, free_idx = 0; i < vhd->bat.entries && free_idx < free_cnt; i++) { vhd_block_t *b = blocks + i; if (b->offset == DD_BLK_UNUSED) continue; for (j = free_idx; j < free_cnt; j++) if (b->block == free_list[j].block) { /* the last block in the file is in the list of * blocks to remove; no need to shuffle the * data -- just clear the bat entry */ vhd->bat.bat[free_list[j].block] = DD_BLK_UNUSED; free_idx++; continue; } err = vhd_clobber_block(journal, b->block, free_list[free_idx++].block); if (err) goto out; } /* clear any bat entries for blocks we did not shuffle */ for (i = free_idx; i < free_cnt; i++) vhd->bat.bat[free_list[i].block] = DD_BLK_UNUSED; out: free(blocks); free(free_list); return err; } static int vhd_clear_bat_entries(vhd_journal_t *journal, uint32_t entries) { int i, err; vhd_context_t *vhd; off64_t orig_map_off, new_map_off; uint32_t orig_entries, new_entries; vhd = &journal->vhd; orig_entries = vhd->header.max_bat_size; new_entries = orig_entries - entries; if (vhd_has_batmap(vhd)) { err = vhd_batmap_header_offset(vhd, &orig_map_off); if (err) return err; } /* update header */ vhd->header.max_bat_size = new_entries; err = vhd_write_header(vhd, &vhd->header); if (err) return err; /* update footer */ vhd->footer.curr_size = (uint64_t)new_entries * vhd->header.block_size; vhd->footer.geometry = vhd_chs(vhd->footer.curr_size); err = vhd_write_footer(vhd, &vhd->footer); if (err) return err; /* update bat -- we don't reclaim space, just clear entries */ for (i = new_entries; i < orig_entries; i++) vhd->bat.bat[i] = 0; err = vhd_write_bat(vhd, &vhd->bat); if (err) return err; /* update 
this after write_bat so the end of the bat is zeored */ vhd->bat.entries = new_entries; if (!vhd_has_batmap(vhd)) return 0; /* zero out old batmap header if new header has moved */ err = vhd_batmap_header_offset(vhd, &new_map_off); if (err) return err; if (orig_map_off != new_map_off) { size_t size; size = vhd_bytes_padded(sizeof(struct dd_batmap_hdr)); err = vhd_write_zeros(journal, orig_map_off, size); if (err) return err; } /* update batmap -- clear entries for freed blocks */ for (i = new_entries; i < orig_entries; i++) vhd_batmap_clear(vhd, &vhd->batmap, i); err = vhd_write_batmap(vhd, &vhd->batmap); if (err) return err; return 0; } static int vhd_dynamic_shrink(vhd_journal_t *journal, uint64_t secs) { off64_t eof; uint32_t blocks; vhd_context_t *vhd; int i, j, err, free_cnt; struct vhd_block *free_list; printf("dynamic shrink not fully implemented\n"); return -ENOSYS; eof = 0; free_cnt = 0; free_list = NULL; vhd = &journal->vhd; blocks = secs_to_blocks_down(vhd, secs); if (blocks == 0) return 0; if (vhd_has_batmap(vhd)) { err = vhd_get_batmap(vhd); if (err) return err; } free_list = malloc(blocks * sizeof(struct vhd_block)); if (!free_list) return -ENOMEM; for (i = vhd->bat.entries - 1, j = 0; i >= 0 && j < blocks; i--, j++) { uint32_t blk = vhd->bat.bat[i]; if (blk != DD_BLK_UNUSED) { free_list[free_cnt].block = i; free_list[free_cnt].offset = blk; free_cnt++; } } if (free_cnt) { err = vhd_defrag_shrink(journal, free_list, free_cnt); if (err) goto out; } err = vhd_clear_bat_entries(journal, blocks); if (err) goto out; /* remove data beyond footer */ err = vhd_end_of_data(vhd, &eof); if (err) goto out; err = ftruncate(vhd->fd, eof + sizeof(vhd_footer_t)); if (err) { err = -errno; goto out; } err = 0; out: free(free_list); return err; } static inline void vhd_first_data_block(vhd_context_t *vhd, vhd_block_t *block) { int i; uint32_t blk; memset(block, 0, sizeof(vhd_block_t)); for (i = 0; i < vhd->bat.entries; i++) { blk = vhd->bat.bat[i]; if (blk != 
DD_BLK_UNUSED) { if (!block->offset || blk < block->offset) { block->block = i; block->offset = blk; } } } } static inline uint32_t vhd_next_block_offset(vhd_context_t *vhd) { int i; uint32_t blk, end, next; next = 0; for (i = 0; i < vhd->bat.entries; i++) { blk = vhd->bat.bat[i]; if (blk != DD_BLK_UNUSED) { end = blk + vhd->spb + vhd->bm_secs; next = MAX(next, end); } } return next; } static inline int in_range(off64_t off, off64_t start, off64_t size) { return (start < off && start + size > off); } #define SKIP_HEADER 0x01 #define SKIP_BAT 0x02 #define SKIP_BATMAP 0x04 #define SKIP_PLOC 0x08 #define SKIP_DATA 0x10 static inline int skip_check(int mode, int type) { return mode & type; } static int vhd_check_for_clobber(vhd_context_t *vhd, off64_t off, int mode) { int i, n; char *msg; size_t size; vhd_block_t fb; vhd_parent_locator_t *loc; msg = NULL; if (!vhd_type_dynamic(vhd)) return 0; if (off < VHD_SECTOR_SIZE) { msg = "backup footer"; goto fail; } if (!skip_check(mode, SKIP_HEADER)) if (in_range(off, vhd->footer.data_offset, sizeof(vhd_header_t))) { msg = "header"; goto fail; } if (!skip_check(mode, SKIP_BAT)) if (in_range(off, vhd->header.table_offset, vhd_bytes_padded(vhd->header.max_bat_size * sizeof(uint32_t)))) { msg = "bat"; goto fail; } if (!skip_check(mode, SKIP_BATMAP)) if (vhd_has_batmap(vhd) && in_range(off, vhd->batmap.header.batmap_offset, vhd_bytes_padded(vhd->batmap.header.batmap_size))) { msg = "batmap"; goto fail; } if (!skip_check(mode, SKIP_PLOC)) { n = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t); for (i = 0; i < n; i++) { loc = vhd->header.loc + i; if (loc->code == PLAT_CODE_NONE) continue; size = vhd_parent_locator_size(loc); if (in_range(off, loc->data_offset, size)) { msg = "parent locator"; goto fail; } } } if (!skip_check(mode, SKIP_DATA)) { vhd_first_data_block(vhd, &fb); if (fb.offset && in_range(off, vhd_sectors_to_bytes(fb.offset), VHD_BLOCK_SIZE)) { msg = "data block"; goto fail; } } return 0; fail: EPRINTF("write to 
0x%08"PRIx64" would clobber %s\n", off, msg); return -EINVAL; } /* * take any metadata after the bat (@eob) and shift it */ static int vhd_shift_metadata(vhd_journal_t *journal, off64_t eob, size_t bat_needed, size_t map_needed) { int i, n, err; vhd_context_t *vhd; size_t size_needed; void *buf; char **locators; vhd_parent_locator_t *loc; vhd = &journal->vhd; size_needed = bat_needed + map_needed; n = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t); locators = calloc(n, sizeof(char *)); if (!locators) return -ENOMEM; for (i = 0; i < n; i++) { size_t size; loc = vhd->header.loc + i; if (loc->code == PLAT_CODE_NONE) continue; if (loc->data_offset < eob) continue; size = vhd_parent_locator_size(loc); err = posix_memalign(&buf, VHD_SECTOR_SIZE, size); if (err) { err = -err; buf = NULL; goto out; } err = vhd_seek(vhd, loc->data_offset, SEEK_SET); if (err) goto out; err = vhd_read(vhd, buf, size); if (err) goto out; locators[i] = buf; } for (i = 0; i < n; i++) { off64_t off; size_t size; if (!locators[i]) continue; loc = vhd->header.loc + i; off = loc->data_offset + size_needed; size = vhd_parent_locator_size(loc); if (vhd_check_for_clobber(vhd, off + size, SKIP_PLOC)) { EPRINTF("%s: shifting locator %d would clobber data\n", vhd->file, i); return -EINVAL; } err = vhd_seek(vhd, off, SEEK_SET); if (err) goto out; err = vhd_write(vhd, locators[i], size); if (err) goto out; free(locators[i]); locators[i] = NULL; loc->data_offset = off; /* write the new header after writing the new bat */ } if (vhd_has_batmap(vhd) && vhd->batmap.header.batmap_offset > eob) { vhd->batmap.header.batmap_offset += bat_needed; /* write the new batmap after writing the new bat */ } err = 0; out: for (i = 0; i < n; i++) free(locators[i]); free(locators); return err; } static int vhd_add_bat_entries(vhd_journal_t *journal, int entries) { int i, err; off64_t off; vhd_bat_t new_bat; vhd_context_t *vhd; uint32_t new_entries; vhd_batmap_t new_batmap; uint64_t bat_size, new_bat_size, map_size, 
new_map_size; void *bat, *map; vhd = &journal->vhd; new_entries = vhd->header.max_bat_size + entries; bat_size = vhd_bytes_padded(vhd->header.max_bat_size * sizeof(uint32_t)); new_bat_size = vhd_bytes_padded(new_entries * sizeof(uint32_t)); map_size = vhd_bytes_padded((vhd->header.max_bat_size + 7) >> 3); new_map_size = vhd_bytes_padded((new_entries + 7) >> 3); off = vhd->header.table_offset + new_bat_size; if (vhd_check_for_clobber(vhd, off, SKIP_BAT | SKIP_BATMAP)) { EPRINTF("%s: writing new bat of 0x%"PRIx64" bytes " "at 0x%08"PRIx64" would clobber data\n", vhd->file, new_bat_size, vhd->header.table_offset); return -EINVAL; } if (vhd_has_batmap(vhd)) { off = vhd->batmap.header.batmap_offset + new_map_size; if (vhd_check_for_clobber(vhd, off, 0)) { EPRINTF("%s: writing new batmap of 0x%"PRIx64" bytes" " at 0x%08"PRIx64" would clobber data\n", vhd->file, new_map_size, vhd->batmap.header.batmap_offset); return -EINVAL; } } /* update header */ vhd->header.max_bat_size = new_entries; err = vhd_write_header(vhd, &vhd->header); if (err) return err; /* allocate new bat */ err = posix_memalign(&bat, VHD_SECTOR_SIZE, new_bat_size); if (err) return -err; new_bat.bat = bat; new_bat.spb = vhd->bat.spb; new_bat.entries = new_entries; memcpy(new_bat.bat, vhd->bat.bat, bat_size); for (i = vhd->bat.entries; i < new_entries; i++) new_bat.bat[i] = DD_BLK_UNUSED; /* write new bat */ err = vhd_write_bat(vhd, &new_bat); if (err) { free(new_bat.bat); return err; } /* update in-memory bat */ free(vhd->bat.bat); vhd->bat = new_bat; if (!vhd_has_batmap(vhd)) return 0; /* allocate new batmap */ err = posix_memalign(&map, VHD_SECTOR_SIZE, new_map_size); if (err) return err; new_batmap.map = map; new_batmap.header = vhd->batmap.header; new_batmap.header.batmap_size = secs_round_up_no_zero(new_map_size); memcpy(new_batmap.map, vhd->batmap.map, map_size); memset(new_batmap.map + map_size, 0, new_map_size - map_size); /* write new batmap */ err = vhd_write_batmap(vhd, &new_batmap); if (err) { 
free(new_batmap.map); return err; } /* update in-memory batmap */ free(vhd->batmap.map); vhd->batmap = new_batmap; /* update footer */ vhd->footer.curr_size = (uint64_t)new_entries * vhd->header.block_size; vhd->footer.geometry = vhd_chs(vhd->footer.curr_size); vhd->footer.checksum = vhd_checksum_footer(&vhd->footer); err = vhd_write_footer(vhd, &vhd->footer); if (err) return err; return 0; } static int vhd_dynamic_grow(vhd_journal_t *journal, uint64_t secs) { int err; off64_t eob, eom; vhd_context_t *vhd; vhd_block_t first_block; uint64_t blocks, size_needed; uint64_t bat_needed, bat_size, bat_avail, bat_bytes, bat_secs; uint64_t map_needed, map_size, map_avail, map_bytes, map_secs; vhd = &journal->vhd; size_needed = 0; bat_needed = 0; map_needed = 0; /* number of vhd blocks to add */ blocks = secs_to_blocks_up(vhd, secs); /* size in bytes needed for new bat entries */ bat_needed = blocks * sizeof(uint32_t); map_needed = (blocks >> 3) + 1; /* available bytes in current bat */ bat_bytes = vhd->header.max_bat_size * sizeof(uint32_t); bat_secs = secs_round_up_no_zero(bat_bytes); bat_size = vhd_sectors_to_bytes(bat_secs); bat_avail = bat_size - bat_bytes; if (vhd_has_batmap(vhd)) { /* avaliable bytes in current batmap */ map_bytes = (vhd->header.max_bat_size + 7) >> 3; map_secs = vhd->batmap.header.batmap_size; map_size = vhd_sectors_to_bytes(map_secs); map_avail = map_size - map_bytes; } else { map_needed = 0; map_avail = 0; } /* we have enough space already; just extend the bat */ if (bat_needed <= bat_avail && map_needed <= map_avail) goto add_entries; /* we need to add new sectors to the bat */ if (bat_needed > bat_avail) { bat_needed -= bat_avail; bat_needed = vhd_bytes_padded(bat_needed); } else bat_needed = 0; /* we need to add new sectors to the batmap */ if (map_needed > map_avail) { map_needed -= map_avail; map_needed = vhd_bytes_padded(map_needed); } else map_needed = 0; /* how many additional bytes do we need? 
*/ size_needed = bat_needed + map_needed; /* calculate space between end of headers and beginning of data */ err = vhd_end_of_headers(vhd, &eom); if (err) return err; eob = vhd->header.table_offset + vhd_sectors_to_bytes(bat_secs); vhd_first_data_block(vhd, &first_block); /* no blocks allocated; just shift post-bat metadata */ if (!first_block.offset) goto shift_metadata; /* * not enough space -- * move vhd data blocks to the end of the file to make room */ do { off64_t new_off, bm_size, gap_size; new_off = vhd_sectors_to_bytes(vhd_next_block_offset(vhd)); /* data region of segment should begin on page boundary */ bm_size = vhd_sectors_to_bytes(vhd->bm_secs); if ((new_off + bm_size) % 4096) { gap_size = 4096 - ((new_off + bm_size) % 4096); err = vhd_write_zeros(journal, new_off, gap_size); if (err) return err; new_off += gap_size; } err = vhd_move_block(journal, first_block.block, new_off); if (err) return err; vhd_first_data_block(vhd, &first_block); } while (eom + size_needed >= vhd_sectors_to_bytes(first_block.offset)); TEST_FAIL_AT(FAIL_RESIZE_DATA_MOVED); shift_metadata: /* shift any metadata after the bat to make room for new bat sectors */ err = vhd_shift_metadata(journal, eob, bat_needed, map_needed); if (err) return err; TEST_FAIL_AT(FAIL_RESIZE_METADATA_MOVED); add_entries: return vhd_add_bat_entries(journal, blocks); } static int vhd_dynamic_resize(vhd_journal_t *journal, uint64_t size) { int err; vhd_context_t *vhd; uint64_t cur_secs, new_secs; vhd = &journal->vhd; cur_secs = vhd->footer.curr_size >> VHD_SECTOR_SHIFT; new_secs = size << (20 - VHD_SECTOR_SHIFT); if (cur_secs == new_secs) return 0; err = vhd_get_header(vhd); if (err) return err; err = vhd_get_bat(vhd); if (err) return err; if (vhd_has_batmap(vhd)) { err = vhd_get_batmap(vhd); if (err) return err; } if (cur_secs > new_secs) err = vhd_dynamic_shrink(journal, cur_secs - new_secs); else err = vhd_dynamic_grow(journal, new_secs - cur_secs); return err; } static int 
vhd_util_resize_check_creator(const char *name) { int err; vhd_context_t vhd; err = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_STRICT); if (err) { printf("error opening %s: %d\n", name, err); return err; } if (!vhd_creator_tapdisk(&vhd)) { printf("%s not created by xen; resize not supported\n", name); err = -EINVAL; } vhd_close(&vhd); return err; } static int vhd_dynamic_grow_fast(const char *name, uint64_t bytes) { vhd_context_t vhd; uint64_t blks, size; int err; err = vhd_open(&vhd, name, VHD_OPEN_RDWR); if (err) return err; err = vhd_get_bat(&vhd); if (err) goto done; if (vhd_has_batmap(&vhd)) { err = vhd_get_batmap(&vhd); if (err) goto done; } blks = (bytes + VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT; size = blks << VHD_BLOCK_SHIFT; if (size < vhd.footer.curr_size) { printf("%s: size (%"PRIu64") < curr size (%"PRIu64")\n", name, size, vhd.footer.curr_size); err = -EINVAL; goto done; } if (size == vhd.footer.curr_size) goto done; err = vhd_set_virt_size(&vhd, size); done: vhd_close(&vhd); return err; } int vhd_util_resize(int argc, char **argv) { char *name, *jname; uint64_t size; int fast, c, err, jerr; vhd_journal_t journal; vhd_context_t *vhd; err = -EINVAL; size = 0; name = NULL; jname = NULL; fast = 0; optind = 0; while ((c = getopt(argc, argv, "n:s:j:fh")) != -1) { switch (c) { case 'n': name = optarg; break; case 'j': jname = optarg; break; case 'f': fast = 1; break; case 's': err = 0; size = strtoull(optarg, NULL, 10); break; case 'h': default: goto usage; } } if (err || !name || (!jname && !fast) || argc != optind) goto usage; if (jname && fast) goto usage; err = vhd_util_resize_check_creator(name); if (err) return err; libvhd_set_log_level(1); if (fast) return vhd_dynamic_grow_fast(name, size << 20); err = vhd_journal_create(&journal, name, jname); if (err) { printf("creating journal failed: %d\n", err); return err; } vhd = &journal.vhd; err = vhd_get_footer(vhd); if (err) goto out; TEST_FAIL_AT(FAIL_RESIZE_BEGIN); if (vhd_type_dynamic(vhd)) err = 
vhd_dynamic_resize(&journal, size); else err = vhd_fixed_resize(&journal, size); TEST_FAIL_AT(FAIL_RESIZE_END); out: if (err) { printf("resize failed: %d\n", err); jerr = vhd_journal_revert(&journal); } else jerr = vhd_journal_commit(&journal); if (jerr) { printf("closing journal failed: %d\n", jerr); vhd_journal_close(&journal); } else vhd_journal_remove(&journal); return (err ? : jerr); usage: printf("options: <-n name> <-s size (in MB)> (<-j journal>|<-f fast>) " "[-h help]\n\n" "The resize operation can only be performed offline " "and must be journaled because resizing the metadata " "might require moving data blocks. However, if a " "VHD was created with -S option (during " "vhd-util create/snapshot), which preallocates the " "metadata for growing the VHD up to size , then " "resizing such a VHD up to can be performed " "online without journaling (-f option).\n"); return -EINVAL; } blktap-2.0.90/vhd/lib/vhd-util-coalesce.c0000644000000000000000000003365111664745551016606 0ustar rootroot/* * Copyright (c) 2007, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include "libvhd.h" static int __raw_io_write(int fd, char* buf, uint64_t sec, uint32_t secs) { off64_t off; size_t ret; errno = 0; off = lseek64(fd, vhd_sectors_to_bytes(sec), SEEK_SET); if (off == (off64_t)-1) { printf("raw parent: seek(0x%08"PRIx64") failed: %d\n", vhd_sectors_to_bytes(sec), -errno); return -errno; } ret = write(fd, buf, vhd_sectors_to_bytes(secs)); if (ret == vhd_sectors_to_bytes(secs)) return 0; printf("raw parent: write of 0x%"PRIx64" returned %zd, errno: %d\n", vhd_sectors_to_bytes(secs), ret, -errno); return (errno ? 
-errno : -EIO); } /* * Use 'parent' if the parent is VHD, and 'parent_fd' if the parent is raw */ static int vhd_util_coalesce_block(vhd_context_t *vhd, vhd_context_t *parent, int parent_fd, uint64_t block) { int i, err; void *buf; char *map; uint64_t sec, secs; buf = NULL; map = NULL; sec = block * vhd->spb; if (vhd->bat.bat[block] == DD_BLK_UNUSED) return 0; err = posix_memalign(&buf, 4096, vhd->header.block_size); if (err) return -err; err = vhd_io_read(vhd, buf, sec, vhd->spb); if (err) goto done; if (vhd_has_batmap(vhd) && vhd_batmap_test(vhd, &vhd->batmap, block)) { if (parent->file) err = vhd_io_write(parent, buf, sec, vhd->spb); else err = __raw_io_write(parent_fd, buf, sec, vhd->spb); goto done; } err = vhd_read_bitmap(vhd, block, &map); if (err) goto done; for (i = 0; i < vhd->spb; i++) { if (!vhd_bitmap_test(vhd, map, i)) continue; for (secs = 0; i + secs < vhd->spb; secs++) if (!vhd_bitmap_test(vhd, map, i + secs)) break; if (parent->file) err = vhd_io_write(parent, buf + vhd_sectors_to_bytes(i), sec + i, secs); else err = __raw_io_write(parent_fd, buf + vhd_sectors_to_bytes(i), sec + i, secs); if (err) goto done; i += secs; } err = 0; done: free(buf); free(map); return err; } static int vhd_util_coalesce_onto(vhd_context_t *from, vhd_context_t *to, int to_fd, int progress) { int err; uint64_t i; err = vhd_get_bat(from); if (err) goto out; if (vhd_has_batmap(from)) { err = vhd_get_batmap(from); if (err) goto out; } for (i = 0; i < from->bat.entries; i++) { if (progress) { printf("\r%6.2f%%", ((float)i / (float)from->bat.entries) * 100.00); fflush(stdout); } err = vhd_util_coalesce_block(from, to, to_fd, i); if (err) goto out; } err = 0; if (progress) printf("\r100.00%%\n"); out: return err; } static int vhd_util_coalesce_parent(const char *name, int sparse, int progress) { char *pname; int err, parent_fd; vhd_context_t vhd, parent; parent_fd = -1; parent.file = NULL; err = vhd_open(&vhd, name, VHD_OPEN_RDONLY); if (err) { printf("error opening %s: 
%d\n", name, err); return err; } err = vhd_parent_locator_get(&vhd, &pname); if (err) { printf("error finding %s parent: %d\n", name, err); vhd_close(&vhd); return err; } if (vhd_parent_raw(&vhd)) { parent_fd = open(pname, O_RDWR | O_DIRECT | O_LARGEFILE, 0644); if (parent_fd == -1) { err = -errno; printf("failed to open parent %s: %d\n", pname, err); vhd_close(&vhd); return err; } } else { int flags = (sparse ? VHD_OPEN_IO_WRITE_SPARSE : 0); if (sparse) printf("opening for sparse writes\n"); err = vhd_open(&parent, pname, VHD_OPEN_RDWR | flags); if (err) { printf("error opening %s: %d\n", pname, err); free(pname); vhd_close(&vhd); return err; } } err = vhd_util_coalesce_onto(&vhd, &parent, parent_fd, progress); free(pname); vhd_close(&vhd); if (parent.file) vhd_close(&parent); else close(parent_fd); return err; } struct vhd_list_entry { int raw; int raw_fd; vhd_context_t vhd; struct list_head next; }; static int vhd_util_pathcmp(const char *a, const char *b, int *cmp) { int err; char *apath = NULL, __apath[PATH_MAX]; char *bpath = NULL, __bpath[PATH_MAX]; apath = realpath(a, __apath); if (!apath) { err = -errno; goto out; } bpath = realpath(b, __bpath); if (!bpath) { err = -errno; goto out; } *cmp = strcmp(apath, bpath); err = 0; out: return err; } static void vhd_util_coalesce_free_chain(struct list_head *head) { struct vhd_list_entry *entry, *tmp; list_for_each_entry_safe(entry, tmp, head, next) { if (entry->raw) close(entry->raw_fd); else vhd_close(&entry->vhd); list_del(&entry->next); free(entry); } INIT_LIST_HEAD(head); } static int vhd_util_coalesce_load_chain(struct list_head *head, const char *cname, const char *aname, int sparse) { char *next; vhd_context_t *child; int err, cmp, vhd_flags; struct vhd_list_entry *entry; next = NULL; entry = NULL; INIT_LIST_HEAD(head); vhd_flags = VHD_OPEN_RDWR | (sparse ? 
VHD_OPEN_IO_WRITE_SPARSE : 0); err = vhd_util_pathcmp(cname, aname, &cmp); if (err) goto out; if (!cmp) { err = -EINVAL; goto out; } entry = calloc(1, sizeof(*entry)); if (!entry) goto out; err = vhd_open(&entry->vhd, cname, vhd_flags); if (err) goto out; err = vhd_get_bat(&entry->vhd); if (err) goto out; if (vhd_has_batmap(&entry->vhd)) { err = vhd_get_batmap(&entry->vhd); if (err) goto out; } child = &entry->vhd; list_add(&entry->next, head); for (;;) { int raw; if (entry->raw || entry->vhd.footer.type != HD_TYPE_DIFF) { err = -ENOENT; goto out; } if (child->header.block_size != entry->vhd.header.block_size) { err = -EINVAL; goto out; } err = vhd_parent_locator_get(&entry->vhd, &next); if (err) goto out; raw = vhd_parent_raw(&entry->vhd); entry = calloc(1, sizeof(*entry)); if (!entry) goto out; if (raw) { entry->raw = raw; entry->raw_fd = open(next, O_RDWR | O_DIRECT | O_LARGEFILE); if (entry->raw_fd == -1) { err = -errno; goto out; } } else { err = vhd_open(&entry->vhd, next, vhd_flags); if (err) goto out; err = vhd_get_bat(&entry->vhd); if (err) goto out; if (vhd_has_batmap(&entry->vhd)) { err = vhd_get_batmap(&entry->vhd); if (err) goto out; } } list_add_tail(&entry->next, head); err = vhd_util_pathcmp(next, aname, &cmp); if (err) goto out; if (!cmp) goto done; free(next); next = NULL; } done: err = 0; out: if (err) { if (entry && list_empty(&entry->next)) { if (entry->vhd.file) vhd_close(&entry->vhd); else if (entry->raw) close(entry->raw_fd); free(entry); } vhd_util_coalesce_free_chain(head); } return err; } static int vhd_util_coalesce_clear_bitmap(vhd_context_t *child, char *cmap, vhd_context_t *ancestor, const uint64_t block) { char *amap = NULL; int i, dirty, err; if (child->spb != ancestor->spb) { err = -EINVAL; goto out; } if (block >= ancestor->bat.entries) goto done; if (ancestor->bat.bat[block] == DD_BLK_UNUSED) goto done; err = vhd_read_bitmap(ancestor, block, &amap); if (err) goto out; for (i = 0; i < child->spb; i++) { if (vhd_bitmap_test(child, 
cmap, i)) { if (vhd_bitmap_test(ancestor, amap, i)) { dirty = 1; vhd_bitmap_clear(ancestor, amap, i); } } } if (dirty) { err = vhd_write_bitmap(ancestor, block, amap); if (err) goto out; if (vhd_has_batmap(ancestor) && vhd_batmap_test(ancestor, &ancestor->batmap, block)) { vhd_batmap_clear(ancestor, &ancestor->batmap, block); err = vhd_write_batmap(ancestor, &ancestor->batmap); if (err) goto out; } } done: err = 0; out: free(amap); return err; } static int vhd_util_coalesce_clear_bitmaps(struct list_head *chain, vhd_context_t *child, vhd_context_t *ancestor, uint64_t block) { int err; char *map = NULL; struct vhd_list_entry *entry; if (child->bat.bat[block] == DD_BLK_UNUSED) goto done; err = vhd_read_bitmap(child, block, &map); if (err) goto out; list_for_each_entry(entry, chain, next) { if (&entry->vhd == child) continue; if (&entry->vhd == ancestor) break; err = vhd_util_coalesce_clear_bitmap(child, map, &entry->vhd, block); if (err) goto out; } done: err = 0; out: free(map); return err; } static int vhd_util_coalesce_ancestor(const char *cname, const char *aname, int sparse, int progress) { uint64_t i; int err, raw_fd; struct list_head chain; struct vhd_list_entry *entry; vhd_context_t *child, *ancestor; child = NULL; ancestor = NULL; err = vhd_util_coalesce_load_chain(&chain, cname, aname, sparse); if (err) goto out; list_for_each_entry(entry, &chain, next) { if (!child) child = &entry->vhd; else if (list_is_last(&entry->next, &chain)) { ancestor = &entry->vhd; raw_fd = entry->raw_fd; break; } } if (!ancestor) { err = -EINVAL; goto out; } err = vhd_util_coalesce_onto(child, ancestor, raw_fd, progress); if (err) goto out; for (i = 0; i < child->bat.entries; i++) { err = vhd_util_coalesce_clear_bitmaps(&chain, child, ancestor, i); if (err) goto out; } out: vhd_util_coalesce_free_chain(&chain); return err; } static int vhd_util_coalesce_open_output(vhd_context_t *dst, vhd_context_t *src, const char *name, int flags) { int err; err = access(name, F_OK); if (!err) { 
printf("%s already exists\n", name); return -EEXIST; } else if (errno != ENOENT) { printf("error checking %s: %d\n", name, errno); return -errno; } err = vhd_create(name, src->footer.curr_size, HD_TYPE_DYNAMIC, 0, 0); if (err) { printf("error creating %s: %d\n", name, err); return err; } err = vhd_open(dst, name, VHD_OPEN_RDWR | flags); if (err || dst->header.block_size != src->header.block_size) { printf("error opening %s: %d\n", name, (err ? : EINVAL)); unlink(name); return err ? : EINVAL; } return 0; } /* * read block from @src chain and write it to @dst, unless it is all zeros */ static int vhd_util_coalesce_block_out(vhd_context_t *dst, vhd_context_t *src, uint64_t block) { int i, err; uint64_t sec; void *buf; char *p; buf = NULL; sec = block * src->spb; err = posix_memalign(&buf, 4096, src->header.block_size); if (err) return -err; err = vhd_io_read(src, buf, sec, src->spb); if (err) goto done; for (p = buf, i = 0; i < src->header.block_size; i++, p++) { if (*p) { err = vhd_io_write(dst, buf, sec, src->spb); break; } } done: free(buf); return err; } static int vhd_util_coalesce_out(const char *src_name, const char *dst_name, int sparse, int progress) { uint64_t i; int err, flags; vhd_context_t src, dst; err = vhd_open(&src, src_name, VHD_OPEN_RDONLY | VHD_OPEN_CACHED); if (err) return err; flags = (sparse ? 
VHD_OPEN_IO_WRITE_SPARSE : 0); err = vhd_util_coalesce_open_output(&dst, &src, dst_name, flags); if (err) { vhd_close(&src); return err; } err = vhd_get_bat(&src); if (err) goto done; if (vhd_has_batmap(&src)) { err = vhd_get_batmap(&src); if (err) goto done; } for (i = 0; i < src.bat.entries; i++) { if (progress) { printf("\r%6.2f%%", ((float)i / (float)src.bat.entries) * 100.0); fflush(stdout); } err = vhd_util_coalesce_block_out(&dst, &src, i); if (err) goto done; } err = 0; if (progress) printf("\r100.00%%\n"); done: if (err) unlink(dst.file); vhd_close(&src); vhd_close(&dst); return err; } int vhd_util_coalesce(int argc, char **argv) { char *name, *oname, *ancestor; int err, c, progress, sparse; name = NULL; oname = NULL; ancestor = NULL; sparse = 0; progress = 0; if (!argc || !argv) goto usage; optind = 0; while ((c = getopt(argc, argv, "n:o:a:sph")) != -1) { switch (c) { case 'n': name = optarg; break; case 'o': oname = optarg; break; case 'a': ancestor = optarg; break; case 's': sparse = 1; break; case 'p': progress = 1; break; case 'h': default: goto usage; } } if (!name || optind != argc) goto usage; if (oname && ancestor) goto usage; if (oname) err = vhd_util_coalesce_out(name, oname, sparse, progress); else if (ancestor) err = vhd_util_coalesce_ancestor(name, ancestor, sparse, progress); else err = vhd_util_coalesce_parent(name, sparse, progress); if (err) printf("error coalescing: %d\n", err); return err; usage: printf("options: <-n name> [-a ancestor] " "[-o output] [-s sparse] [-p progress] [-h help]\n"); return -EINVAL; } blktap-2.0.90/vhd/lib/libvhd-journal.c0000644000000000000000000006777411664745551016233 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include "atomicio.h" #include "libvhd-journal.h" #define VHD_JOURNAL_ENTRY_TYPE_FOOTER_P 1 #define VHD_JOURNAL_ENTRY_TYPE_FOOTER_C 2 #define VHD_JOURNAL_ENTRY_TYPE_HEADER 3 #define VHD_JOURNAL_ENTRY_TYPE_LOCATOR 4 #define VHD_JOURNAL_ENTRY_TYPE_BAT 5 #define VHD_JOURNAL_ENTRY_TYPE_BATMAP_H 6 #define VHD_JOURNAL_ENTRY_TYPE_BATMAP_M 7 #define VHD_JOURNAL_ENTRY_TYPE_DATA 8 typedef struct vhd_journal_entry { uint64_t cookie; uint32_t type; uint32_t size; uint64_t offset; uint32_t checksum; } vhd_journal_entry_t; static inline int vhd_journal_seek(vhd_journal_t *j, off64_t offset, int whence) { off64_t off; off = lseek64(j->jfd, offset, whence); if (off == (off64_t)-1) return -errno; return 0; } static inline off64_t vhd_journal_position(vhd_journal_t *j) { return lseek64(j->jfd, 0, SEEK_CUR); } static inline int vhd_journal_read(vhd_journal_t *j, void *buf, size_t size) { ssize_t ret; errno = 0; ret = atomicio(read, j->jfd, buf, size); if (ret != size) return (errno ? -errno : -EIO); return 0; } static inline int vhd_journal_write(vhd_journal_t *j, void *buf, size_t size) { ssize_t ret; errno = 0; ret = atomicio(vwrite, j->jfd, buf, size); if (ret != size) return (errno ? 
-errno : -EIO); return 0; } static inline int vhd_journal_truncate(vhd_journal_t *j, off64_t length) { int err; err = ftruncate(j->jfd, length); if (err == -1) return -errno; return 0; } static inline int vhd_journal_sync(vhd_journal_t *j) { int err; err = fdatasync(j->jfd); if (err) return -errno; return 0; } static inline void vhd_journal_header_in(vhd_journal_header_t *header) { BE64_IN(&header->vhd_footer_offset); BE32_IN(&header->journal_data_entries); BE32_IN(&header->journal_metadata_entries); BE64_IN(&header->journal_data_offset); BE64_IN(&header->journal_metadata_offset); } static inline void vhd_journal_header_out(vhd_journal_header_t *header) { BE64_OUT(&header->vhd_footer_offset); BE32_OUT(&header->journal_data_entries); BE32_OUT(&header->journal_metadata_entries); BE64_OUT(&header->journal_data_offset); BE64_OUT(&header->journal_metadata_offset); } static int vhd_journal_validate_header(vhd_journal_t *j, vhd_journal_header_t *header) { int err; off64_t eof; if (memcmp(header->cookie, VHD_JOURNAL_HEADER_COOKIE, sizeof(header->cookie))) return -EINVAL; err = vhd_journal_seek(j, j->header.journal_eof, SEEK_SET); if (err) return err; eof = vhd_journal_position(j); if (eof == (off64_t)-1) return -errno; if (j->header.journal_data_offset > j->header.journal_eof) return -EINVAL; if (j->header.journal_metadata_offset > j->header.journal_eof) return -EINVAL; return 0; } static int vhd_journal_read_journal_header(vhd_journal_t *j, vhd_journal_header_t *header) { int err; size_t size; size = sizeof(vhd_journal_header_t); err = vhd_journal_seek(j, 0, SEEK_SET); if (err) return err; err = vhd_journal_read(j, header, size); if (err) return err; vhd_journal_header_in(header); return vhd_journal_validate_header(j, header); } static int vhd_journal_write_header(vhd_journal_t *j, vhd_journal_header_t *header) { int err; size_t size; vhd_journal_header_t h; memcpy(&h, header, sizeof(vhd_journal_header_t)); err = vhd_journal_validate_header(j, &h); if (err) return err; 
vhd_journal_header_out(&h); size = sizeof(vhd_journal_header_t); err = vhd_journal_seek(j, 0, SEEK_SET); if (err) return err; err = vhd_journal_write(j, &h, size); if (err) return err; return 0; } static int vhd_journal_add_journal_header(vhd_journal_t *j) { int err; off64_t off; vhd_context_t *vhd; vhd = &j->vhd; memset(&j->header, 0, sizeof(vhd_journal_header_t)); err = vhd_seek(vhd, 0, SEEK_END); if (err) return err; off = vhd_position(vhd); if (off == (off64_t)-1) return -errno; err = vhd_get_footer(vhd); if (err) return err; uuid_copy(j->header.uuid, vhd->footer.uuid); memcpy(j->header.cookie, VHD_JOURNAL_HEADER_COOKIE, sizeof(j->header.cookie)); j->header.vhd_footer_offset = off - sizeof(vhd_footer_t); j->header.journal_eof = sizeof(vhd_journal_header_t); return vhd_journal_write_header(j, &j->header); } static void vhd_journal_entry_in(vhd_journal_entry_t *entry) { BE32_IN(&entry->type); BE32_IN(&entry->size); BE64_IN(&entry->offset); BE64_IN(&entry->cookie); BE32_IN(&entry->checksum); } static void vhd_journal_entry_out(vhd_journal_entry_t *entry) { BE32_OUT(&entry->type); BE32_OUT(&entry->size); BE64_OUT(&entry->offset); BE64_OUT(&entry->cookie); BE32_OUT(&entry->checksum); } static uint32_t vhd_journal_checksum_entry(vhd_journal_entry_t *entry, char *buf, size_t size) { int i; unsigned char *blob; uint32_t checksum, tmp; checksum = 0; tmp = entry->checksum; entry->checksum = 0; blob = (unsigned char *)entry; for (i = 0; i < sizeof(vhd_journal_entry_t); i++) checksum += blob[i]; blob = (unsigned char *)buf; for (i = 0; i < size; i++) checksum += blob[i]; entry->checksum = tmp; return ~checksum; } static int vhd_journal_validate_entry(vhd_journal_entry_t *entry) { if (entry->size == 0) return -EINVAL; if (entry->size & (VHD_SECTOR_SIZE - 1)) return -EINVAL; if (entry->cookie != VHD_JOURNAL_ENTRY_COOKIE) return -EINVAL; return 0; } static int vhd_journal_read_entry(vhd_journal_t *j, vhd_journal_entry_t *entry) { int err; err = vhd_journal_read(j, entry, 
sizeof(vhd_journal_entry_t)); if (err) return err; vhd_journal_entry_in(entry); return vhd_journal_validate_entry(entry); } static int vhd_journal_write_entry(vhd_journal_t *j, vhd_journal_entry_t *entry) { int err; vhd_journal_entry_t e; err = vhd_journal_validate_entry(entry); if (err) return err; memcpy(&e, entry, sizeof(vhd_journal_entry_t)); vhd_journal_entry_out(&e); err = vhd_journal_write(j, &e, sizeof(vhd_journal_entry_t)); if (err) return err; return 0; } static int vhd_journal_validate_entry_data(vhd_journal_entry_t *entry, char *buf) { int err; uint32_t checksum; err = 0; checksum = vhd_journal_checksum_entry(entry, buf, entry->size); if (checksum != entry->checksum) return -EINVAL; return err; } static int vhd_journal_update(vhd_journal_t *j, off64_t offset, char *buf, size_t size, uint32_t type) { int err; uint64_t *off, off_bak; uint32_t *entries; vhd_journal_entry_t entry; entry.type = type; entry.size = size; entry.offset = offset; entry.cookie = VHD_JOURNAL_ENTRY_COOKIE; entry.checksum = vhd_journal_checksum_entry(&entry, buf, size); err = vhd_journal_seek(j, j->header.journal_eof, SEEK_SET); if (err) return err; err = vhd_journal_write_entry(j, &entry); if (err) goto fail; err = vhd_journal_write(j, buf, size); if (err) goto fail; if (type == VHD_JOURNAL_ENTRY_TYPE_DATA) { off = &j->header.journal_data_offset; entries = &j->header.journal_data_entries; } else { off = &j->header.journal_metadata_offset; entries = &j->header.journal_metadata_entries; } off_bak = *off; if (!(*entries)++) *off = j->header.journal_eof; j->header.journal_eof += (size + sizeof(vhd_journal_entry_t)); err = vhd_journal_write_header(j, &j->header); if (err) { if (!--(*entries)) *off = off_bak; j->header.journal_eof -= (size + sizeof(vhd_journal_entry_t)); goto fail; } return 0; fail: if (!j->is_block) vhd_journal_truncate(j, j->header.journal_eof); return err; } static int vhd_journal_add_footer(vhd_journal_t *j) { int err; off64_t off; vhd_context_t *vhd; vhd_footer_t 
footer; vhd = &j->vhd; err = vhd_seek(vhd, 0, SEEK_END); if (err) return err; off = vhd_position(vhd); if (off == (off64_t)-1) return -errno; err = vhd_read_footer_at(vhd, &footer, off - sizeof(vhd_footer_t)); if (err) return err; vhd_footer_out(&footer); err = vhd_journal_update(j, off - sizeof(vhd_footer_t), (char *)&footer, sizeof(vhd_footer_t), VHD_JOURNAL_ENTRY_TYPE_FOOTER_P); if (err) return err; if (!vhd_type_dynamic(vhd)) return 0; err = vhd_read_footer_at(vhd, &footer, 0); if (err) return err; vhd_footer_out(&footer); err = vhd_journal_update(j, 0, (char *)&footer, sizeof(vhd_footer_t), VHD_JOURNAL_ENTRY_TYPE_FOOTER_C); return err; }

/*
 * Read the VHD header, convert it with vhd_header_out() and append it to
 * the journal as a HEADER entry whose recorded offset is the footer's
 * data_offset.
 * Returns 0 on success or a negative error.
 */
static int
vhd_journal_add_header(vhd_journal_t *j)
{
	int err;
	off64_t off;
	vhd_context_t *vhd;
	vhd_header_t header;

	vhd = &j->vhd;

	err = vhd_read_header(vhd, &header);
	if (err)
		return err;

	off = vhd->footer.data_offset;

	vhd_header_out(&header);
	err = vhd_journal_update(j, off,
				 (char *)&header,
				 sizeof(vhd_header_t),
				 VHD_JOURNAL_ENTRY_TYPE_HEADER);

	return err;
}

/*
 * Append every in-use parent locator of the VHD to the journal.
 *
 * Each slot of header.loc is validated; slots with PLAT_CODE_NONE are
 * skipped.  For the others the locator data is read from the VHD into a
 * sector-aligned bounce buffer and journaled as a LOCATOR entry at the
 * locator's data_offset.
 * Returns 0 on success or the negative error of the first failure.
 */
static int
vhd_journal_add_locators(vhd_journal_t *j)
{
	int i, n, err;
	vhd_context_t *vhd;

	vhd = &j->vhd;

	err = vhd_get_header(vhd);
	if (err)
		return err;

	n = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t);
	for (i = 0; i < n; i++) {
		void *buf;
		off64_t off;
		size_t size;
		vhd_parent_locator_t *loc;

		loc = vhd->header.loc + i;
		err = vhd_validate_platform_code(loc->code);
		if (err)
			return err;

		if (loc->code == PLAT_CODE_NONE)
			continue;

		off = loc->data_offset;
		size = vhd_parent_locator_size(loc);

		/* posix_memalign reports failure as a positive errno value */
		err = posix_memalign(&buf, VHD_SECTOR_SIZE, size);
		if (err)
			return -err;

		err = vhd_seek(vhd, off, SEEK_SET);
		if (err)
			goto end;

		err = vhd_read(vhd, buf, size);
		if (err)
			goto end;

		err = vhd_journal_update(j, off, buf, size,
					 VHD_JOURNAL_ENTRY_TYPE_LOCATOR);
		if (err)
			goto end;

		err = 0;

	end:
		/* the bounce buffer is per-iteration; release it either way */
		free(buf);
		if (err)
			break;
	}

	return err;
}

static int vhd_journal_add_bat(vhd_journal_t *j) { int err; off64_t off; size_t size; vhd_bat_t bat; vhd_context_t *vhd; vhd = &j->vhd; err =
vhd_get_header(vhd); if (err) return err; err = vhd_read_bat(vhd, &bat); if (err) return err; off = vhd->header.table_offset; size = vhd_bytes_padded(bat.entries * sizeof(uint32_t)); vhd_bat_out(&bat); err = vhd_journal_update(j, off, (char *)bat.bat, size, VHD_JOURNAL_ENTRY_TYPE_BAT); free(bat.bat); return err; }

/*
 * Journal the batmap: first the batmap header (BATMAP_H entry, padded to
 * whole sectors), then the map itself (BATMAP_M entry) at the offset and
 * size named in the batmap header.
 *
 * NOTE(review): the header entry is written with a sector-padded size
 * starting at &batmap.header; this presumably relies on the in-memory
 * layout behind the header -- confirm against vhd_batmap_t.
 *
 * Returns 0 on success or a negative error.
 */
static int
vhd_journal_add_batmap(vhd_journal_t *j)
{
	int err;
	off64_t off;
	size_t size;
	vhd_context_t *vhd;
	vhd_batmap_t batmap;

	vhd = &j->vhd;

	err = vhd_batmap_header_offset(vhd, &off);
	if (err)
		return err;

	err = vhd_read_batmap(vhd, &batmap);
	if (err)
		return err;

	size = vhd_bytes_padded(sizeof(struct dd_batmap_hdr));

	vhd_batmap_header_out(&batmap);
	err = vhd_journal_update(j, off, (char *)&batmap.header, size,
				 VHD_JOURNAL_ENTRY_TYPE_BATMAP_H);
	if (err)
		goto out;

	/* convert back so batmap_offset/batmap_size can be used below */
	vhd_batmap_header_in(&batmap);
	off = batmap.header.batmap_offset;
	size = vhd_sectors_to_bytes(batmap.header.batmap_size);

	err = vhd_journal_update(j, off, batmap.map, size,
				 VHD_JOURNAL_ENTRY_TYPE_BATMAP_M);

out:
	free(batmap.map);
	return err;
}

/*
 * Journal all VHD metadata in a fixed order: footer(s), then -- for
 * dynamic VHDs only -- header, parent locators, BAT and (if present)
 * batmap.  Afterwards journal_data_offset is set to the current
 * journal_eof and the journal header is rewritten.
 * Returns 0 on success or the negative error of the first failing step.
 */
static int
vhd_journal_add_metadata(vhd_journal_t *j)
{
	int err;
	vhd_context_t *vhd;

	vhd = &j->vhd;

	err = vhd_journal_add_footer(j);
	if (err)
		return err;

	if (!vhd_type_dynamic(vhd))
		return 0;

	err = vhd_journal_add_header(j);
	if (err)
		return err;

	err = vhd_journal_add_locators(j);
	if (err)
		return err;

	err = vhd_journal_add_bat(j);
	if (err)
		return err;

	if (vhd_has_batmap(vhd)) {
		err = vhd_journal_add_batmap(j);
		if (err)
			return err;
	}

	/* data entries, if any, start where the metadata ends */
	j->header.journal_data_offset = j->header.journal_eof;
	return vhd_journal_write_header(j, &j->header);
}

/*
 * Read the next journal entry, require it to be a footer entry of the
 * given @type with a payload of exactly sizeof(vhd_footer_t), then read
 * the payload into @footer, convert it with vhd_footer_in() and validate
 * it.
 * Returns 0 on success, -EINVAL on a type/size mismatch, or the error of
 * the failing read/validation.
 */
static int
__vhd_journal_read_footer(vhd_journal_t *j,
			  vhd_footer_t *footer, uint32_t type)
{
	int err;
	vhd_journal_entry_t entry;

	err = vhd_journal_read_entry(j, &entry);
	if (err)
		return err;

	if (entry.type != type)
		return -EINVAL;

	if (entry.size != sizeof(vhd_footer_t))
		return -EINVAL;

	err = vhd_journal_read(j, footer, entry.size);
	if (err)
		return err;

	vhd_footer_in(footer);
	return vhd_validate_footer(footer);
}

static int
vhd_journal_read_footer(vhd_journal_t *j, vhd_footer_t *footer) { return __vhd_journal_read_footer(j, footer, VHD_JOURNAL_ENTRY_TYPE_FOOTER_P); } static int vhd_journal_read_footer_copy(vhd_journal_t *j, vhd_footer_t *footer) { return __vhd_journal_read_footer(j, footer, VHD_JOURNAL_ENTRY_TYPE_FOOTER_C); } static int vhd_journal_read_header(vhd_journal_t *j, vhd_header_t *header) { int err; vhd_journal_entry_t entry; err = vhd_journal_read_entry(j, &entry); if (err) return err; if (entry.type != VHD_JOURNAL_ENTRY_TYPE_HEADER) return -EINVAL; if (entry.size != sizeof(vhd_header_t)) return -EINVAL; err = vhd_journal_read(j, header, entry.size); if (err) return err; vhd_header_in(header); return vhd_validate_header(header); } static int vhd_journal_read_locators(vhd_journal_t *j, char ***locators, int *locs) { int err, n, _locs; char **_locators; void *buf; off_t pos; vhd_journal_entry_t entry; _locs = 0; *locs = 0; *locators = NULL; n = sizeof(j->vhd.header.loc) / sizeof(vhd_parent_locator_t); _locators = calloc(n, sizeof(char *)); if (!_locators) return -ENOMEM; for (;;) { buf = NULL; pos = vhd_journal_position(j); err = vhd_journal_read_entry(j, &entry); if (err) goto fail; if (entry.type != VHD_JOURNAL_ENTRY_TYPE_LOCATOR) { err = vhd_journal_seek(j, pos, SEEK_SET); if (err) goto fail; break; } if (_locs >= n) { err = -EINVAL; goto fail; } err = posix_memalign(&buf, VHD_SECTOR_SIZE, entry.size); if (err) { err = -err; buf = NULL; goto fail; } err = vhd_journal_read(j, buf, entry.size); if (err) goto fail; _locators[_locs++] = buf; err = 0; } *locs = _locs; *locators = _locators; return 0; fail: if (_locators) { for (n = 0; n < _locs; n++) free(_locators[n]); free(_locators); } return err; } static int vhd_journal_read_bat(vhd_journal_t *j, vhd_bat_t *bat) { int err; size_t size; vhd_context_t *vhd; vhd_journal_entry_t entry; void *_bat; vhd = &j->vhd; size = vhd_bytes_padded(vhd->header.max_bat_size * sizeof(uint32_t)); err = vhd_journal_read_entry(j, &entry); if 
(err) return err; if (entry.type != VHD_JOURNAL_ENTRY_TYPE_BAT) return -EINVAL; if (entry.size != size) return -EINVAL; if (entry.offset != vhd->header.table_offset) return -EINVAL; err = posix_memalign(&_bat, VHD_SECTOR_SIZE, size); if (err) return -err; bat->bat = _bat; err = vhd_journal_read(j, bat->bat, entry.size); if (err) goto fail; bat->spb = vhd->header.block_size >> VHD_SECTOR_SHIFT; bat->entries = vhd->header.max_bat_size; vhd_bat_in(bat); return 0; fail: free(bat->bat); bat->bat = NULL; return err; } static int vhd_journal_read_batmap_header(vhd_journal_t *j, vhd_batmap_t *batmap) { int err; void *buf; size_t size; vhd_journal_entry_t entry; size = vhd_bytes_padded(sizeof(struct dd_batmap_hdr)); err = vhd_journal_read_entry(j, &entry); if (err) return err; if (entry.type != VHD_JOURNAL_ENTRY_TYPE_BATMAP_H) return -EINVAL; if (entry.size != size) return -EINVAL; err = posix_memalign(&buf, VHD_SECTOR_SIZE, size); if (err) return err; err = vhd_journal_read(j, buf, entry.size); if (err) { free(buf); return err; } memcpy(&batmap->header, buf, sizeof(batmap->header)); vhd_batmap_header_in(batmap); return vhd_validate_batmap_header(batmap); } static int vhd_journal_read_batmap_map(vhd_journal_t *j, vhd_batmap_t *batmap) { int err; vhd_journal_entry_t entry; void *map; err = vhd_journal_read_entry(j, &entry); if (err) return err; if (entry.type != VHD_JOURNAL_ENTRY_TYPE_BATMAP_M) return -EINVAL; if (entry.size != vhd_sectors_to_bytes(batmap->header.batmap_size)) return -EINVAL; if (entry.offset != batmap->header.batmap_offset) return -EINVAL; err = posix_memalign(&map, VHD_SECTOR_SIZE, entry.size); if (err) return -err; batmap->map = map; err = vhd_journal_read(j, batmap->map, entry.size); if (err) { free(batmap->map); batmap->map = NULL; return err; } return 0; } static int vhd_journal_read_batmap(vhd_journal_t *j, vhd_batmap_t *batmap) { int err; err = vhd_journal_read_batmap_header(j, batmap); if (err) return err; err = vhd_journal_read_batmap_map(j, 
batmap); if (err) return err; err = vhd_validate_batmap(&j->vhd, batmap); if (err) { free(batmap->map); batmap->map = NULL; return err; } return 0; } static int vhd_journal_restore_footer(vhd_journal_t *j, vhd_footer_t *footer) { return vhd_write_footer_at(&j->vhd, footer, j->header.vhd_footer_offset); } static int vhd_journal_restore_footer_copy(vhd_journal_t *j, vhd_footer_t *footer) { return vhd_write_footer_at(&j->vhd, footer, 0); } static int vhd_journal_restore_header(vhd_journal_t *j, vhd_header_t *header) { off64_t off; vhd_context_t *vhd; vhd = &j->vhd; off = vhd->footer.data_offset; return vhd_write_header_at(&j->vhd, header, off); } static int vhd_journal_restore_locators(vhd_journal_t *j, char **locators, int locs) { size_t size; vhd_context_t *vhd; int i, n, lidx, err; vhd_parent_locator_t *loc; lidx = 0; vhd = &j->vhd; n = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t); for (i = 0; i < n && lidx < locs; i++) { loc = vhd->header.loc + i; if (loc->code == PLAT_CODE_NONE) continue; err = vhd_seek(vhd, loc->data_offset, SEEK_SET); if (err) return err; size = vhd_parent_locator_size(loc); err = vhd_write(vhd, locators[lidx++], size); if (err) return err; } return 0; } static int vhd_journal_restore_bat(vhd_journal_t *j, vhd_bat_t *bat) { return vhd_write_bat(&j->vhd, bat); } static int vhd_journal_restore_batmap(vhd_journal_t *j, vhd_batmap_t *batmap) { return vhd_write_batmap(&j->vhd, batmap); } static int vhd_journal_restore_metadata(vhd_journal_t *j) { off64_t off; char **locators; vhd_footer_t copy; vhd_context_t *vhd; int i, locs, hlocs, err; vhd = &j->vhd; locs = 0; hlocs = 0; locators = NULL; err = vhd_journal_seek(j, sizeof(vhd_journal_header_t), SEEK_SET); if (err) return err; err = vhd_journal_read_footer(j, &vhd->footer); if (err) return err; if (!vhd_type_dynamic(vhd)) goto restore; err = vhd_journal_read_footer_copy(j, ©); if (err) return err; err = vhd_journal_read_header(j, &vhd->header); if (err) return err; for (hlocs = 0, i = 0; i 
< vhd_parent_locator_count(vhd); i++) { if (vhd_validate_platform_code(vhd->header.loc[i].code)) return err; if (vhd->header.loc[i].code != PLAT_CODE_NONE) hlocs++; } if (hlocs) { err = vhd_journal_read_locators(j, &locators, &locs); if (err) return err; if (hlocs != locs) { err = -EINVAL; goto out; } } err = vhd_journal_read_bat(j, &vhd->bat); if (err) goto out; if (vhd_has_batmap(vhd)) { err = vhd_journal_read_batmap(j, &vhd->batmap); if (err) goto out; } restore: off = vhd_journal_position(j); if (off == (off64_t)-1) return -errno; if (j->header.journal_data_offset != off) return -EINVAL; err = vhd_journal_restore_footer(j, &vhd->footer); if (err) goto out; if (!vhd_type_dynamic(vhd)) goto out; err = vhd_journal_restore_footer_copy(j, ©); if (err) goto out; err = vhd_journal_restore_header(j, &vhd->header); if (err) goto out; if (locs) { err = vhd_journal_restore_locators(j, locators, locs); if (err) goto out; } err = vhd_journal_restore_bat(j, &vhd->bat); if (err) goto out; if (vhd_has_batmap(vhd)) { err = vhd_journal_restore_batmap(j, &vhd->batmap); if (err) goto out; } err = 0; out: if (locators) { for (i = 0; i < locs; i++) free(locators[i]); free(locators); } if (!err && !vhd->is_block) err = ftruncate(vhd->fd, j->header.vhd_footer_offset + sizeof(vhd_footer_t)); return err; } static int vhd_journal_disable_vhd(vhd_journal_t *j) { int err; vhd_context_t *vhd; vhd = &j->vhd; err = vhd_get_footer(vhd); if (err) return err; memcpy(&vhd->footer.cookie, VHD_POISON_COOKIE, sizeof(vhd->footer.cookie)); vhd->footer.checksum = vhd_checksum_footer(&vhd->footer); err = vhd_write_footer(vhd, &vhd->footer); if (err) return err; return 0; } static int vhd_journal_enable_vhd(vhd_journal_t *j) { int err; vhd_context_t *vhd; vhd = &j->vhd; err = vhd_get_footer(vhd); if (err) return err; if (!vhd_disabled(vhd)) return 0; memcpy(&vhd->footer.cookie, HD_COOKIE, sizeof(vhd->footer.cookie)); vhd->footer.checksum = vhd_checksum_footer(&vhd->footer); err = vhd_write_footer(vhd, 
&vhd->footer); if (err) return err; return 0; }

/*
 * Release the resources held by @j without touching the journal file on
 * disk: close the journal descriptor, close the VHD and free the journal
 * name.  Always returns 0.
 *
 * NOTE(review): the descriptor test is "non-zero", while the open/create
 * paths use -1 as the "no descriptor" value -- confirm this is intended.
 */
int
vhd_journal_close(vhd_journal_t *j)
{
	if (j->jfd)
		close(j->jfd);

	vhd_close(&j->vhd);
	free(j->jname);

	return 0;
}

/*
 * Re-enable the VHD and dispose of the journal: the journal descriptor is
 * closed and, unless the journal lives on a block device, the journal
 * file is unlinked.  The VHD is closed and the journal name freed.
 * Returns 0 on success, or the error from re-enabling the VHD (in which
 * case nothing is released).
 */
int
vhd_journal_remove(vhd_journal_t *j)
{
	int err;

	err = vhd_journal_enable_vhd(j);
	if (err)
		return err;

	if (j->jfd) {
		close(j->jfd);
		if (!j->is_block)
			unlink(j->jname);
	}

	vhd_close(&j->vhd);
	free(j->jname);

	return 0;
}

/*
 * Open an existing journal @jfile for the VHD @file.
 *
 * The journal header is read and the journaled metadata is written back
 * to the VHD through a raw O_DIRECT descriptor; the VHD is then reopened
 * through vhd_open(), its BAT (and batmap, if any) loaded, and the VHD
 * marked disabled again.
 *
 * Returns 0 on success.  On failure after the journal name has been
 * duplicated, @j is torn down with vhd_journal_close() and a negative
 * error is returned.
 */
int
vhd_journal_open(vhd_journal_t *j, const char *file, const char *jfile)
{
	int err;
	vhd_context_t *vhd;

	memset(j, 0, sizeof(vhd_journal_t));

	j->jfd = -1;
	vhd = &j->vhd;

	j->jname = strdup(jfile);
	if (j->jname == NULL)
		return -ENOMEM;

	j->jfd = open(j->jname, O_LARGEFILE | O_RDWR);
	if (j->jfd == -1) {
		err = -errno;
		goto fail;
	}

	err = vhd_test_file_fixed(j->jname, &j->is_block);
	if (err)
		goto fail;

	/* raw descriptor: the VHD may not be openable via vhd_open() yet */
	vhd->fd = open(file, O_LARGEFILE | O_RDWR | O_DIRECT);
	if (vhd->fd == -1) {
		err = -errno;
		goto fail;
	}

	err = vhd_test_file_fixed(file, &vhd->is_block);
	if (err)
		goto fail;

	err = vhd_journal_read_journal_header(j, &j->header);
	if (err)
		goto fail;

	err = vhd_journal_restore_metadata(j);
	if (err)
		goto fail;

	/* drop the raw descriptor and the buffers the restore allocated */
	close(vhd->fd);
	free(vhd->bat.bat);
	free(vhd->batmap.map);

	err = vhd_open(vhd, file, VHD_OPEN_RDWR);
	if (err)
		goto fail;

	err = vhd_get_bat(vhd);
	if (err)
		goto fail;

	if (vhd_has_batmap(vhd)) {
		err = vhd_get_batmap(vhd);
		if (err)
			goto fail;
	}

	err = vhd_journal_disable_vhd(j);
	if (err)
		goto fail;

	return 0;

fail:
	vhd_journal_close(j);
	return err;
}

int vhd_journal_create(vhd_journal_t *j, const char *file, const char *jfile) { int err; memset(j, 0, sizeof(vhd_journal_t)); j->jfd = -1; j->jname = strdup(jfile); if (j->jname == NULL) { err = -ENOMEM; goto fail1; } if (access(j->jname, F_OK) == 0) { err = vhd_test_file_fixed(j->jname, &j->is_block); if (err) goto fail1; if (!j->is_block) { err = -EEXIST; goto fail1; } } if (j->is_block) j->jfd = open(j->jname, O_LARGEFILE | O_RDWR, 0644); else j->jfd = open(j->jname, O_CREAT | O_TRUNC | O_LARGEFILE | O_RDWR, 0644); if (j->jfd == -1) { err = -errno;
goto fail1; } err = vhd_open(&j->vhd, file, VHD_OPEN_RDWR | VHD_OPEN_STRICT); if (err) goto fail1; err = vhd_get_bat(&j->vhd); if (err) goto fail2; if (vhd_has_batmap(&j->vhd)) { err = vhd_get_batmap(&j->vhd); if (err) goto fail2; } err = vhd_journal_add_journal_header(j); if (err) goto fail2; err = vhd_journal_add_metadata(j); if (err) goto fail2; err = vhd_journal_disable_vhd(j); if (err) goto fail2; err = vhd_journal_sync(j); if (err) goto fail2; return 0; fail1: if (j->jfd != -1) { close(j->jfd); if (!j->is_block) unlink(j->jname); } free(j->jname); memset(j, 0, sizeof(vhd_journal_t)); return err; fail2: vhd_journal_remove(j); return err; }

/*
 * Journal the current contents of one VHD block before it is modified.
 *
 * @block is an index into the BAT; @mode is a bit mask selecting what to
 * save: VHD_JOURNAL_METADATA journals the block's bitmap,
 * VHD_JOURNAL_DATA journals the block's data sectors (which start right
 * after the bitmap).  Both are recorded as DATA entries.
 *
 * Returns 0 on success (including when the block is unallocated and
 * there is nothing to save), -EINVAL for non-dynamic VHDs, -ERANGE if
 * @block is outside the BAT, or a negative error from reading/journaling.
 */
int
vhd_journal_add_block(vhd_journal_t *j, uint32_t block, char mode)
{
	int err;
	char *buf;
	off64_t off;
	size_t size;
	uint64_t blk;
	vhd_context_t *vhd;

	buf = NULL;
	vhd = &j->vhd;

	if (!vhd_type_dynamic(vhd))
		return -EINVAL;

	err = vhd_get_bat(vhd);
	if (err)
		return err;

	if (block >= vhd->bat.entries)
		return -ERANGE;

	blk = vhd->bat.bat[block];
	if (blk == DD_BLK_UNUSED)
		return 0;

	off = vhd_sectors_to_bytes(blk);

	if (mode & VHD_JOURNAL_METADATA) {
		size = vhd_sectors_to_bytes(vhd->bm_secs);

		err = vhd_read_bitmap(vhd, block, &buf);
		if (err)
			return err;

		err = vhd_journal_update(j, off, buf, size,
					 VHD_JOURNAL_ENTRY_TYPE_DATA);

		free(buf);

		if (err)
			return err;
	}

	if (mode & VHD_JOURNAL_DATA) {
		/* data sectors follow the block's bitmap */
		off += vhd_sectors_to_bytes(vhd->bm_secs);
		size = vhd_sectors_to_bytes(vhd->spb);

		err = vhd_read_block(vhd, block, &buf);
		if (err)
			return err;

		err = vhd_journal_update(j, off, buf, size,
					 VHD_JOURNAL_ENTRY_TYPE_DATA);
		free(buf);

		if (err)
			return err;
	}

	return vhd_journal_sync(j);
}

/*
 * commit indicates the transaction completed
 * successfully and we can remove the undo log
 */
int vhd_journal_commit(vhd_journal_t *j) { int err; j->header.journal_data_entries = 0; j->header.journal_metadata_entries = 0; j->header.journal_data_offset = 0; j->header.journal_metadata_offset = 0; err = vhd_journal_write_header(j, &j->header); if (err) return err; if
(!j->is_block) err = vhd_journal_truncate(j, sizeof(vhd_journal_header_t)); if (err) return -errno; return 0; } /* * revert indicates the transaction failed * and we should revert any changes via the undo log */ int vhd_journal_revert(vhd_journal_t *j) { int i, err; char *file; void *buf; vhd_context_t *vhd; vhd_journal_entry_t entry; err = 0; vhd = &j->vhd; buf = NULL; file = strdup(vhd->file); if (!file) return -ENOMEM; vhd_close(&j->vhd); j->vhd.fd = open(file, O_RDWR | O_DIRECT | O_LARGEFILE); if (j->vhd.fd == -1) { free(file); return -errno; } err = vhd_test_file_fixed(file, &vhd->is_block); if (err) { free(file); return err; } err = vhd_journal_restore_metadata(j); if (err) { free(file); return err; } close(vhd->fd); free(vhd->bat.bat); free(vhd->batmap.map); err = vhd_open(vhd, file, VHD_OPEN_RDWR); free(file); if (err) return err; err = vhd_journal_seek(j, j->header.journal_data_offset, SEEK_SET); if (err) return err; for (i = 0; i < j->header.journal_data_entries; i++) { err = vhd_journal_read_entry(j, &entry); if (err) goto end; err = posix_memalign(&buf, VHD_SECTOR_SIZE, entry.size); if (err) { err = -err; buf = NULL; goto end; } err = vhd_journal_read(j, buf, entry.size); if (err) goto end; err = vhd_journal_validate_entry_data(&entry, buf); if (err) goto end; err = vhd_seek(vhd, entry.offset, SEEK_SET); if (err) goto end; err = vhd_write(vhd, buf, entry.size); if (err) goto end; err = 0; end: free(buf); buf = NULL; if (err) break; } if (err) return err; if (!vhd->is_block) { err = ftruncate(vhd->fd, j->header.vhd_footer_offset + sizeof(vhd_footer_t)); if (err) return -errno; } return vhd_journal_sync(j); } blktap-2.0.90/vhd/lib/libvhd-index.c0000644000000000000000000005706511664745551015660 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include #include #include #include "libvhd.h" #include "libvhd-index.h" #include "relative-path.h" typedef struct vhdi_path vhdi_path_t; typedef struct vhdi_header vhdi_header_t; typedef struct vhdi_bat_header vhdi_bat_header_t; typedef struct vhdi_file_table_header vhdi_file_table_header_t; typedef struct vhdi_file_table_entry vhdi_file_table_entry_t; static const char VHDI_HEADER_COOKIE[] = "vhdindex"; static const char VHDI_BAT_HEADER_COOKIE[] = "vhdi-bat"; static const char VHDI_FILE_TABLE_HEADER_COOKIE[] = "vhdifile"; struct vhdi_path { char path[VHD_MAX_NAME_LEN]; uint16_t bytes; }; struct vhdi_header { char cookie[8]; uint32_t vhd_block_size; uint64_t table_offset; }; struct vhdi_bat_header { char cookie[8]; uint64_t vhd_blocks; uint32_t vhd_block_size; vhdi_path_t vhd_path; vhdi_path_t index_path; vhdi_path_t file_table_path; uint64_t table_offset; }; struct vhdi_file_table_header { char cookie[8]; uint32_t files; uint64_t table_offset; }; struct vhdi_file_table_entry { vhdi_path_t p; vhdi_file_id_t file_id; uuid_t vhd_uuid; uint32_t vhd_timestamp; }; static inline int vhdi_seek(vhdi_context_t *ctx, off64_t off, int whence) { int err; err = lseek64(ctx->fd, off, whence); if (err == (off64_t)-1) return -errno; return 0; } static inline off64_t vhdi_position(vhdi_context_t *ctx) { return lseek64(ctx->fd, 0, SEEK_CUR); } static inline int vhdi_read(vhdi_context_t *ctx, void *buf, size_t size) { int err; err = read(ctx->fd, buf, size); if (err != size) return (errno ? -errno : -EIO); return 0; } static inline int vhdi_write(vhdi_context_t *ctx, void *buf, size_t size) { int err; err = write(ctx->fd, buf, size); if (err != size) return (errno ? 
-errno : -EIO); return 0; } static inline int vhdi_check_block_size(uint32_t block_size) { int i, cnt; cnt = 0; for (i = 0; i < 32; i++) if ((block_size >> i) & 0x0001) cnt++; if (cnt == 1) return 0; return -EINVAL; } static inline void vhdi_header_in(vhdi_header_t *header) { BE32_IN(&header->vhd_block_size); BE64_IN(&header->table_offset); } static inline void vhdi_header_out(vhdi_header_t *header) { BE32_OUT(&header->vhd_block_size); BE64_OUT(&header->table_offset); } static inline int vhdi_header_validate(vhdi_header_t *header) { if (memcmp(header->cookie, VHDI_HEADER_COOKIE, sizeof(header->cookie))) return -EINVAL; return vhdi_check_block_size(header->vhd_block_size); } void vhdi_entry_in(vhdi_entry_t *entry) { BE32_IN(&entry->file_id); BE32_IN(&entry->offset); } static inline vhdi_entry_t vhdi_entry_out(vhdi_entry_t *entry) { vhdi_entry_t e; e = *entry; BE32_OUT(&e.file_id); BE32_OUT(&e.offset); return e; } static inline void vhdi_path_in(vhdi_path_t *path) { BE16_IN(&path->bytes); } static inline void vhdi_path_out(vhdi_path_t *path) { BE16_OUT(&path->bytes); } static inline void vhdi_bat_header_in(vhdi_bat_header_t *header) { BE64_IN(&header->vhd_blocks); BE32_IN(&header->vhd_block_size); vhdi_path_in(&header->vhd_path); vhdi_path_in(&header->index_path); vhdi_path_in(&header->file_table_path); BE64_IN(&header->table_offset); } static inline void vhdi_bat_header_out(vhdi_bat_header_t *header) { BE64_OUT(&header->vhd_blocks); BE32_OUT(&header->vhd_block_size); vhdi_path_out(&header->vhd_path); vhdi_path_out(&header->index_path); vhdi_path_out(&header->file_table_path); BE64_OUT(&header->table_offset); } static inline int vhdi_path_validate(vhdi_path_t *path) { int i; if (path->bytes >= VHD_MAX_NAME_LEN - 1) return -ENAMETOOLONG; for (i = 0; i < path->bytes; i++) if (path->path[i] == '\0') return 0; return -EINVAL; } static inline char * vhdi_path_expand(const char *src, vhdi_path_t *dest, int *err) { int len; char *path, *base, copy[VHD_MAX_NAME_LEN]; char 
*absolute_path, __absolute_path[PATH_MAX]; strcpy(copy, src); base = dirname(copy); *err = asprintf(&path, "%s/%s", base, dest->path); if (*err == -1) { *err = -ENOMEM; return NULL; } absolute_path = realpath(path, __absolute_path); free(path); if (absolute_path) absolute_path = strdup(absolute_path); if (!absolute_path) { *err = -errno; return NULL; } len = strnlen(absolute_path, VHD_MAX_NAME_LEN - 1); if (len == VHD_MAX_NAME_LEN - 1) { free(absolute_path); *err = -ENAMETOOLONG; return NULL; } *err = 0; return absolute_path; } static inline int vhdi_bat_header_validate(vhdi_bat_header_t *header) { int err; if (memcmp(header->cookie, VHDI_BAT_HEADER_COOKIE, sizeof(header->cookie))) return -EINVAL; err = vhdi_check_block_size(header->vhd_block_size); if (err) return err; err = vhdi_path_validate(&header->vhd_path); if (err) return err; err = vhdi_path_validate(&header->index_path); if (err) return err; err = vhdi_path_validate(&header->file_table_path); if (err) return err; return 0; } static inline int vhdi_bat_load_header(int fd, vhdi_bat_header_t *header) { int err; err = lseek64(fd, 0, SEEK_SET); if (err == (off64_t)-1) return -errno; err = read(fd, header, sizeof(vhdi_bat_header_t)); if (err != sizeof(vhdi_bat_header_t)) return (errno ? 
-errno : -EIO); vhdi_bat_header_in(header); return vhdi_bat_header_validate(header); }

/*
 * Convert a file table header read from disk, in place.
 *
 * Uses the *_IN macros like every other *_in() helper in this file (the
 * previous code used the *_OUT macros here).
 * NOTE(review): presumably both macro families perform the same byte
 * swap, making this a consistency fix only -- confirm in libvhd.h.
 */
static inline void
vhdi_file_table_header_in(vhdi_file_table_header_t *header)
{
	BE32_IN(&header->files);
	BE64_IN(&header->table_offset);
}

/* Convert a file table header for writing to disk, in place. */
static inline void
vhdi_file_table_header_out(vhdi_file_table_header_t *header)
{
	BE32_OUT(&header->files);
	BE64_OUT(&header->table_offset);
}

/*
 * Check the cookie of a file table header.
 * Returns 0 if it matches, -EINVAL otherwise.
 */
static inline int
vhdi_file_table_header_validate(vhdi_file_table_header_t *header)
{
	if (memcmp(header->cookie,
		   VHDI_FILE_TABLE_HEADER_COOKIE, sizeof(header->cookie)))
		return -EINVAL;

	return 0;
}

/*
 * Read the file table header from the start of @fd, convert it and
 * validate it.
 * Returns 0 on success; -errno (or -EIO if errno is unset) on a failed
 * or short read; -EINVAL if the cookie does not match.
 */
static inline int
vhdi_file_table_load_header(int fd, vhdi_file_table_header_t *header)
{
	int err;

	/* offset 0 always fits in an int, so the int result is safe here */
	err = lseek64(fd, 0, SEEK_SET);
	if (err == (off64_t)-1)
		return -errno;

	err = read(fd, header, sizeof(vhdi_file_table_header_t));
	if (err != sizeof(vhdi_file_table_header_t))
		return (errno ? -errno : -EIO);

	vhdi_file_table_header_in(header);
	return vhdi_file_table_header_validate(header);
}

static inline int vhdi_file_table_write_header(int fd, vhdi_file_table_header_t *header) { int err; err = lseek64(fd, 0, SEEK_SET); if (err == (off64_t)-1) return -errno; err = vhdi_file_table_header_validate(header); if (err) return err; vhdi_file_table_header_out(header); err = write(fd, header, sizeof(vhdi_file_table_header_t)); if (err != sizeof(vhdi_file_table_header_t)) return (errno ?
-errno : -EIO); return 0; } static inline void vhdi_file_table_entry_in(vhdi_file_table_entry_t *entry) { vhdi_path_in(&entry->p); BE32_IN(&entry->file_id); BE32_IN(&entry->vhd_timestamp); } static inline void vhdi_file_table_entry_out(vhdi_file_table_entry_t *entry) { vhdi_path_out(&entry->p); BE32_OUT(&entry->file_id); BE32_OUT(&entry->vhd_timestamp); } static inline int vhdi_file_table_entry_validate(vhdi_file_table_entry_t *entry) { return vhdi_path_validate(&entry->p); } static inline int vhdi_file_table_entry_validate_vhd(vhdi_file_table_entry_t *entry, const char *path) { int err; vhd_context_t vhd; struct stat stats; err = stat(path, &stats); if (err == -1) return -errno; if (entry->vhd_timestamp != vhd_time(stats.st_mtime)) return -EINVAL; err = vhd_open(&vhd, path, VHD_OPEN_RDONLY); if (err) return err; err = vhd_get_footer(&vhd); if (err) goto out; if (uuid_compare(entry->vhd_uuid, vhd.footer.uuid)) { err = -EINVAL; goto out; } out: vhd_close(&vhd); return err; } int vhdi_create(const char *name, uint32_t vhd_block_size) { void *buf; int err, fd; size_t size; vhdi_header_t header; memset(&header, 0, sizeof(vhdi_header_t)); err = vhdi_check_block_size(vhd_block_size); if (err) return err; err = access(name, F_OK); if (!err || errno != ENOENT) return (err ? err : -EEXIST); memcpy(header.cookie, VHDI_HEADER_COOKIE, sizeof(header.cookie)); header.vhd_block_size = vhd_block_size; header.table_offset = vhd_bytes_padded(sizeof(vhdi_header_t)); err = vhdi_header_validate(&header); if (err) return err; vhdi_header_out(&header); size = vhd_bytes_padded(sizeof(vhdi_header_t)); err = posix_memalign(&buf, VHD_SECTOR_SIZE, size); if (err) return -err; memset(buf, 0, size); memcpy(buf, &header, sizeof(vhdi_header_t)); fd = open(name, O_CREAT | O_TRUNC | O_RDWR, 0600); if (fd == -1) return -errno; err = write(fd, buf, size); if (err != size) { err = (errno ? 
-errno : -EIO); goto fail; } close(fd); free(buf); return 0; fail: close(fd); free(buf); unlink(name); return err; } int vhdi_open(vhdi_context_t *ctx, const char *file, int flags) { int err, fd; size_t size; char *name; void *buf; vhdi_header_t header; buf = NULL; memset(ctx, 0, sizeof(vhdi_context_t)); name = strdup(file); if (!name) return -ENOMEM; fd = open(file, flags | O_LARGEFILE); if (fd == -1) { free(name); return -errno; } size = vhd_bytes_padded(sizeof(vhdi_header_t)); err = posix_memalign(&buf, VHD_SECTOR_SIZE, size); if (err) { err = -err; goto fail; } err = read(fd, buf, size); if (err != size) { err = (errno ? -errno : -EIO); goto fail; } memcpy(&header, buf, sizeof(vhdi_header_t)); free(buf); buf = NULL; vhdi_header_in(&header); err = vhdi_header_validate(&header); if (err) goto fail; ctx->fd = fd; ctx->name = name; ctx->spb = header.vhd_block_size >> VHD_SECTOR_SHIFT; ctx->vhd_block_size = header.vhd_block_size; return 0; fail: close(fd); free(buf); free(name); return err; } void vhdi_close(vhdi_context_t *ctx) { close(ctx->fd); free(ctx->name); } int vhdi_read_block(vhdi_context_t *ctx, vhdi_block_t *block, uint32_t sector) { int i, err; size_t size; void *tab; err = vhdi_seek(ctx, vhd_sectors_to_bytes(sector), SEEK_SET); if (err) return err; size = vhd_bytes_padded(ctx->spb * sizeof(vhdi_entry_t)); block->entries = ctx->spb; err = posix_memalign(&tab, VHD_SECTOR_SIZE, size); if (err) return -err; block->table = tab; err = vhdi_read(ctx, block->table, size); if (err) goto fail; for (i = 0; i < block->entries; i++) vhdi_entry_in(&block->table[i]); return 0; fail: free(block->table); return err; } int vhdi_write_block(vhdi_context_t *ctx, vhdi_block_t *block, uint32_t sector) { void *buf; int i, err; size_t size; vhdi_entry_t *entries; err = vhdi_seek(ctx, vhd_sectors_to_bytes(sector), SEEK_SET); if (err) return err; size = vhd_bytes_padded(ctx->spb * sizeof(vhdi_entry_t)); err = posix_memalign(&buf, VHD_SECTOR_SIZE, size); if (err) return -err; 
memset(buf, 0, size); entries = (vhdi_entry_t *)buf; for (i = 0; i < block->entries; i++) entries[i] = vhdi_entry_out(&block->table[i]); err = vhdi_write(ctx, entries, size); if (err) goto out; err = 0; out: free(entries); return err; } int vhdi_append_block(vhdi_context_t *ctx, vhdi_block_t *block, uint32_t *sector) { void *buf; int i, err; off64_t off; size_t size; vhdi_entry_t *entries; err = vhdi_seek(ctx, 0, SEEK_END); if (err) return err; off = vhd_bytes_padded(vhdi_position(ctx)); err = vhdi_seek(ctx, off, SEEK_SET); if (err) return err; size = vhd_bytes_padded(block->entries * sizeof(vhdi_entry_t)); err = posix_memalign(&buf, VHD_SECTOR_SIZE, size); if (err) return -err; memset(buf, 0, size); entries = buf; for (i = 0; i < block->entries; i++) entries[i] = vhdi_entry_out(&block->table[i]); err = vhdi_write(ctx, entries, block->entries * sizeof(vhdi_entry_t)); if (err) goto out; err = 0; *sector = off >> VHD_SECTOR_SHIFT; out: if (err) { int gcc = ftruncate(ctx->fd, off); if (gcc) {} } free(entries); return err; } static int vhdi_copy_path_to(vhdi_path_t *path, const char *src, const char *dest) { int len, err; char *file, *relative_path, copy[VHD_MAX_NAME_LEN]; char *absolute_path, __absolute_path[PATH_MAX]; strcpy(copy, dest); file = basename(copy); absolute_path = realpath(copy, __absolute_path); relative_path = NULL; if (!absolute_path) { err = -errno; goto out; } if (!strcmp(file, "")) { err = -EINVAL; goto out; } relative_path = relative_path_to((char *)src, absolute_path, &err); if (!relative_path || err) { err = (err ? 
err : -EINVAL); goto out; } len = strnlen(relative_path, VHD_MAX_NAME_LEN - 1); if (len == VHD_MAX_NAME_LEN - 1) { err = -ENAMETOOLONG; goto out; } strcpy(path->path, relative_path); path->bytes = len + 1; err = 0; out: free(relative_path); return err; } int vhdi_bat_create(const char *name, const char *vhd, const char *index, const char *file_table) { int err, fd; off64_t off; vhd_context_t ctx; vhdi_bat_header_t header; memset(&header, 0, sizeof(vhdi_bat_header_t)); err = access(name, F_OK); if (!err || errno != ENOENT) return (err ? -err : -EEXIST); err = vhd_open(&ctx, vhd, VHD_OPEN_RDONLY); if (err) return err; err = vhd_get_header(&ctx); if (err) { vhd_close(&ctx); return err; } header.vhd_blocks = ctx.header.max_bat_size; header.vhd_block_size = ctx.header.block_size; vhd_close(&ctx); fd = open(name, O_CREAT | O_TRUNC | O_RDWR, 0600); if (fd == -1) return -errno; err = vhdi_copy_path_to(&header.vhd_path, name, vhd); if (err) goto fail; err = vhdi_copy_path_to(&header.index_path, name, index); if (err) goto fail; err = vhdi_copy_path_to(&header.file_table_path, name, file_table); if (err) goto fail; off = vhd_bytes_padded(sizeof(vhdi_bat_header_t)); header.table_offset = off; memcpy(header.cookie, VHDI_BAT_HEADER_COOKIE, sizeof(header.cookie)); err = vhdi_bat_header_validate(&header); if (err) goto fail; vhdi_bat_header_out(&header); err = write(fd, &header, sizeof(vhdi_bat_header_t)); if (err != sizeof(vhdi_bat_header_t)) { err = (errno ? 
-errno : -EIO); goto fail; } close(fd); return 0; fail: close(fd); unlink(name); return err; } int vhdi_bat_load(const char *name, vhdi_bat_t *bat) { char *path; int err, fd; size_t size; uint32_t *table; vhdi_bat_header_t header; table = NULL; fd = open(name, O_RDONLY | O_LARGEFILE); if (fd == -1) return -errno; err = vhdi_bat_load_header(fd, &header); if (err) goto out; size = header.vhd_blocks * sizeof(uint32_t); table = malloc(size); if (!table) { err = -ENOMEM; goto out; } err = lseek64(fd, header.table_offset, SEEK_SET); if (err == (off64_t)-1) { err = -errno; goto out; } err = read(fd, table, size); if (err != size) { err = (errno ? -errno : -EIO); goto out; } path = vhdi_path_expand(name, &header.vhd_path, &err); if (err) goto out; strcpy(bat->vhd_path, path); free(path); err = access(bat->vhd_path, F_OK); if (err == -1) { err = -errno; goto out; } path = vhdi_path_expand(name, &header.index_path, &err); if (err) goto out; strcpy(bat->index_path, path); free(path); err = access(bat->index_path, F_OK); if (err == -1) { err = -errno; goto out; } path = vhdi_path_expand(name, &header.file_table_path, &err); if (err) goto out; strcpy(bat->file_table_path, path); free(path); err = access(bat->file_table_path, F_OK); if (err == -1) { err = -errno; goto out; } bat->vhd_blocks = header.vhd_blocks; bat->vhd_block_size = header.vhd_block_size; bat->table = table; err = 0; out: close(fd); if (err) { free(table); memset(bat, 0, sizeof(vhdi_bat_t)); } return err; } int vhdi_bat_write(const char *name, vhdi_bat_t *bat) { int err, fd; size_t size; vhdi_bat_header_t header; fd = open(name, O_RDWR | O_LARGEFILE); if (fd == -1) return -errno; err = vhdi_bat_load_header(fd, &header); if (err) goto out; if (header.vhd_blocks != bat->vhd_blocks || header.vhd_block_size != bat->vhd_block_size) { err = -EINVAL; goto out; } err = lseek64(fd, header.table_offset, SEEK_SET); if (err == (off64_t)-1) { err = -errno; goto out; } size = bat->vhd_blocks * sizeof(uint32_t); err = 
write(fd, bat->table, size); if (err != size) { err = (errno ? -errno : -EIO); goto out; } err = 0; out: close(fd); return err; } int vhdi_file_table_create(const char *file) { int err, fd; off64_t off; vhdi_file_table_header_t header; memset(&header, 0, sizeof(vhdi_file_table_header_t)); err = access(file, F_OK); if (!err || errno != ENOENT) return (err ? err : -EEXIST); off = vhd_bytes_padded(sizeof(vhdi_file_table_header_t)); header.files = 0; header.table_offset = off; memcpy(header.cookie, VHDI_FILE_TABLE_HEADER_COOKIE, sizeof(header.cookie)); vhdi_file_table_header_out(&header); fd = open(file, O_CREAT | O_TRUNC | O_RDWR, 0600); if (fd == -1) return -errno; err = write(fd, &header, sizeof(vhdi_file_table_header_t)); if (err != sizeof(vhdi_file_table_header_t)) { err = (errno ? -errno : -EIO); goto out; } err = 0; out: close(fd); return err; } int vhdi_file_table_load(const char *name, vhdi_file_table_t *table) { off64_t off; size_t size; int err, i, fd; vhdi_file_table_header_t header; vhdi_file_table_entry_t *entries; entries = NULL; fd = open(name, O_RDONLY | O_LARGEFILE); if (fd == -1) return -errno; err = vhdi_file_table_load_header(fd, &header); if (err) goto out; if (!header.files) goto out; table->table = calloc(header.files, sizeof(vhdi_file_ref_t)); if (!table->table) { err = -ENOMEM; goto out; } off = header.table_offset; err = lseek64(fd, off, SEEK_SET); if (err == (off64_t)-1) { err = -errno; goto out; } size = header.files * sizeof(vhdi_file_table_entry_t); entries = calloc(header.files, sizeof(vhdi_file_table_entry_t)); if (!entries) { err = -ENOMEM; goto out; } err = read(fd, entries, size); if (err != size) { err = (errno ? 
-errno : -EIO); goto out; } for (i = 0; i < header.files; i++) { vhdi_file_table_entry_t *entry; entry = entries + i; vhdi_file_table_entry_in(entry); err = vhdi_file_table_entry_validate(entry); if (err) goto out; table->table[i].path = vhdi_path_expand(name, &entry->p, &err); if (err) goto out; err = vhdi_file_table_entry_validate_vhd(entry, table->table[i].path); if (err) goto out; table->table[i].file_id = entry->file_id; table->table[i].vhd_timestamp = entry->vhd_timestamp; uuid_copy(table->table[i].vhd_uuid, entry->vhd_uuid); } err = 0; table->entries = header.files; out: close(fd); free(entries); if (err) { if (table->table) { for (i = 0; i < header.files; i++) free(table->table[i].path); free(table->table); } memset(table, 0, sizeof(vhdi_file_table_t)); } return err; } static int vhdi_file_table_next_fid(const char *name, const char *file, vhdi_file_id_t *fid) { int i, err; char *path, __path[PATH_MAX]; vhdi_file_id_t max; vhdi_file_table_t files; max = 0; path = NULL; err = vhdi_file_table_load(name, &files); if (err) return err; path = realpath(file, __path); if (!path) { err = -errno; goto out; } for (i = 0; i < files.entries; i++) { if (!strcmp(path, files.table[i].path)) { err = -EEXIST; goto out; } max = MAX(max, files.table[i].file_id); } *fid = max + 1; err = 0; out: vhdi_file_table_free(&files); return err; } static inline int vhdi_file_table_entry_initialize(vhdi_file_table_entry_t *entry, const char *file_table, const char *file, vhdi_file_id_t fid) { int err; struct stat stats; vhd_context_t vhd; memset(entry, 0, sizeof(vhdi_file_table_entry_t)); err = stat(file, &stats); if (err == -1) return -errno; entry->file_id = fid; entry->vhd_timestamp = vhd_time(stats.st_mtime); err = vhd_open(&vhd, file, VHD_OPEN_RDONLY); if (err) goto out; err = vhd_get_footer(&vhd); if (err) { vhd_close(&vhd); goto out; } uuid_copy(entry->vhd_uuid, vhd.footer.uuid); vhd_close(&vhd); err = vhdi_copy_path_to(&entry->p, file_table, file); if (err) goto out; err = 0; 
out: if (err) memset(entry, 0, sizeof(vhdi_file_table_entry_t)); return err; } int vhdi_file_table_add(const char *name, const char *file, vhdi_file_id_t *_fid) { off64_t off; size_t size; vhdi_file_id_t fid; int err, fd, len; vhdi_file_table_entry_t entry; vhdi_file_table_header_t header; off = 0; fid = 0; *_fid = 0; len = strnlen(file, VHD_MAX_NAME_LEN - 1); if (len == VHD_MAX_NAME_LEN - 1) return -ENAMETOOLONG; err = vhdi_file_table_next_fid(name, file, &fid); if (err) return err; fd = open(name, O_RDWR | O_LARGEFILE); if (fd == -1) return -errno; err = vhdi_file_table_load_header(fd, &header); if (err) goto out; size = sizeof(vhdi_file_table_entry_t); off = header.table_offset + size * header.files; err = lseek64(fd, off, SEEK_SET); if (err == (off64_t)-1) { err = -errno; goto out; } err = vhdi_file_table_entry_initialize(&entry, name, file, fid); if (err) goto out; vhdi_file_table_entry_out(&entry); err = write(fd, &entry, size); if (err != size) { err = (errno ? -errno : -EIO); goto out; } header.files++; err = vhdi_file_table_write_header(fd, &header); if (err) goto out; err = 0; *_fid = fid; out: if (err && off) { int gcc = ftruncate(fd, off); if (gcc) {}; } close(fd); return err; } void vhdi_file_table_free(vhdi_file_table_t *table) { int i; if (table->table) { for (i = 0; i < table->entries; i++) free(table->table[i].path); free(table->table); } memset(table, 0, sizeof(vhdi_file_table_t)); } blktap-2.0.90/vhd/Makefile.am0000644000000000000000000000041311664745551014406 0ustar rootroot SUBDIRS = lib AM_CFLAGS = -Wall AM_CFLAGS += -Werror AM_CPPFLAGS = -D_GNU_SOURCE AM_CPPFLAGS += -I$(top_srcdir)/include bin_PROGRAMS = vhd-util bin_PROGRAMS += vhd-index bin_PROGRAMS += vhd-update LDADD = lib/libvhd.la vhd_index_LDADD = lib/libvhd.la -luuid blktap-2.0.90/vhd/vhd-update.c0000644000000000000000000001425411664745551014567 0ustar rootroot/* * Copyright (c) 2007, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Before updating a VHD file, we create a journal consisting of: * - all data at the beginning of the file, up to and including the BAT * - each allocated bitmap (existing at the same offset in the journal as * its corresponding bitmap in the original file) * Updates are performed in place by writing appropriately * transformed versions of journaled bitmaps to the original file. 
/*
 * Test bit @nr of the bitmap at @addr using the old layout: the bitmap is
 * read as an array of unsigned long words, and within a word bit 0 is the
 * least significant bit.
 *
 * Returns 1 if the bit is set, 0 otherwise.
 */
static inline int
old_test_bit(int nr, volatile void * addr)
{
	const size_t word_bits = sizeof(unsigned long) * 8;
	unsigned long word;

	word = ((unsigned long *)addr)[nr / word_bits];

	return (word >> (nr % word_bits)) & 1;
}
if (err) goto out; } err = 0; out: free(converted); return err; } static int open_journal(vhd_journal_t *journal, const char *file, const char *jfile) { int err; err = vhd_journal_create(journal, file, jfile); if (err) { printf("error creating journal for %s: %d\n", file, err); return err; } return 0; } static int close_journal(vhd_journal_t *journal, int err) { if (err) err = vhd_journal_revert(journal); else err = vhd_journal_commit(journal); if (err) return vhd_journal_close(journal); else return vhd_journal_remove(journal); } int main(int argc, char **argv) { char *file, *jfile; int c, err, rollback; vhd_journal_t journal; file = NULL; jfile = NULL; rollback = 0; while ((c = getopt(argc, argv, "n:j:rh")) != -1) { switch(c) { case 'n': file = optarg; break; case 'j': jfile = optarg; err = access(jfile, R_OK); if (err == -1) { printf("invalid journal arg %s\n", jfile); return -errno; } break; case 'r': /* add a rollback option for debugging which * pushes journalled bitmaps to original file * without transforming them */ rollback = 1; break; default: usage(); } } if (!file) usage(); if (rollback && !jfile) { printf("rollback requires a journal argument\n"); usage(); } err = open_journal(&journal, file, jfile); if (err) return err; if (!vhd_creator_tapdisk(&journal.vhd) || journal.vhd.footer.crtr_ver != VHD_VERSION(0, 1) || journal.vhd.footer.type == HD_TYPE_FIXED) { err = 0; goto out; } err = journal_bitmaps(&journal); if (err) { /* no changes to vhd file yet, * so close the journal and bail */ vhd_journal_close(&journal); return err; } err = update_vhd(&journal, rollback); if (err) { printf("update failed: %d; saving journal\n", err); goto out; } err = update_creator_version(&journal); if (err) { printf("failed to udpate creator version: %d\n", err); goto out; } err = 0; out: err = close_journal(&journal, err); return err; } blktap-2.0.90/vhd/vhd-index.c0000644000000000000000000004573311664745551014422 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. 
* Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include "libvhd.h" #include "libvhd-index.h" static void usage(void) { printf("usage: vhd-index \n" "commands:\n" "\t index: <-i index name> <-v vhd file>\n" "\t summary: <-s index name> [-v vhd file [-b block]]\n"); exit(-EINVAL); } typedef struct vhdi_name vhdi_name_t; struct vhdi_name { char *vhd; char *bat; char *base; char *index; char *files; }; static int vhd_index_get_name(const char *index, const char *vhd, vhdi_name_t *name) { int err, len; memset(name, 0, sizeof(vhdi_name_t)); len = strnlen(index, VHD_MAX_NAME_LEN); if (len + 5 >= VHD_MAX_NAME_LEN - 1) return -ENAMETOOLONG; if (vhd) { len = strnlen(vhd, VHD_MAX_NAME_LEN); if (len >= VHD_MAX_NAME_LEN - 1) return -ENAMETOOLONG; err = asprintf(&name->vhd, "%s", vhd); if (err == -1) { name->vhd = NULL; goto fail; } err = asprintf(&name->bat, "%s.bat", vhd); if (err == -1) { name->bat = NULL; goto fail; } } err = asprintf(&name->base, "%s", index); if (err == -1) { name->base = NULL; goto fail; } err = asprintf(&name->index, "%s.index", index); if (err == -1) { name->index = NULL; goto fail; } err = asprintf(&name->files, "%s.files", index); if (err == -1) { name->files = NULL; goto fail; } return 0; fail: free(name->vhd); free(name->bat); free(name->base); free(name->index); free(name->files); return -ENOMEM; } static inline void vhd_index_free_name(vhdi_name_t *name) { free(name->vhd); free(name->bat); free(name->base); free(name->index); free(name->files); } static inline int vhd_index_add_file_table_entry(vhdi_name_t *name, const char *file, vhdi_file_table_t *files, vhdi_file_id_t *fid) { int err; vhdi_file_table_free(files); err = vhdi_file_table_add(name->files, file, fid); if (err) return err; return vhdi_file_table_load(name->files, files); } static inline int vhd_index_get_file_id(vhdi_name_t *name, const char *file, vhdi_file_table_t *files, vhdi_file_id_t *fid) { char *path, 
__path[PATH_MAX]; int i; path = realpath(file, __path); if (!path) return -errno; for (i = 0; i < files->entries; i++) if (!strcmp(files->table[i].path, path)) { *fid = files->table[i].file_id; return 0; } return vhd_index_add_file_table_entry(name, file, files, fid); } static inline int vhd_index_get_block(vhdi_context_t *vhdi, vhd_context_t *vhd, uint32_t block, vhdi_block_t *vhdi_block) { int i; if (block) return vhdi_read_block(vhdi, vhdi_block, block); vhdi_block->entries = vhd->spb; vhdi_block->table = calloc(vhd->spb, sizeof(vhdi_entry_t)); if (!vhdi_block->table) return -ENOMEM; for (i = 0; i < vhdi_block->entries; i++) vhdi_block->table[i].offset = DD_BLK_UNUSED; return 0; } static int vhd_index_add_bat_entry(vhdi_name_t *name, vhdi_context_t *vhdi, vhdi_bat_t *bat, vhdi_file_table_t *files, vhd_context_t *vhd, uint32_t block, char *finished) { char *map; vhdi_file_id_t fid; uint32_t i, count, off; vhdi_block_t vhdi_block; int err, update, append; fid = 0; count = 0; update = 0; append = (bat->table[block] == 0); if (vhd->bat.bat[block] == DD_BLK_UNUSED) return 0; err = vhd_index_get_block(vhdi, vhd, bat->table[block], &vhdi_block); if (err) return err; err = vhd_read_bitmap(vhd, block, &map); if (err) goto out; err = vhd_index_get_file_id(name, vhd->file, files, &fid); if (err) goto out; for (i = 0; i < vhd->spb; i++) { if (vhdi_block.table[i].file_id) { count++; continue; } if (!vhd_bitmap_test(vhd, map, i)) continue; err = vhd_offset(vhd, (uint64_t)block * vhd->spb + i, &off); if (err) goto out; vhdi_block.table[i].file_id = fid; vhdi_block.table[i].offset = off; count++; update++; } if (update) { if (append) { uint32_t location; err = vhdi_append_block(vhdi, &vhdi_block, &location); if (err) goto out; bat->table[block] = location; } else { err = vhdi_write_block(vhdi, &vhdi_block, bat->table[block]); if (err) goto out; } } if (count == vhd->spb) *finished = 1; err = 0; out: free(vhdi_block.table); free(map); return err; } static int 
vhd_index_clone_bat_entry(vhdi_name_t *name, vhdi_context_t *vhdi, vhdi_bat_t *bat, vhdi_file_table_t *files, vhd_context_t *vhd, uint32_t block) { char *map; int err, update; uint32_t i, off; vhdi_file_id_t fid; vhdi_block_t vhdi_block; fid = 0; update = 0; if (vhd->bat.bat[block] == DD_BLK_UNUSED) return 0; err = vhd_index_get_block(vhdi, vhd, bat->table[block], &vhdi_block); if (err) return err; err = vhd_read_bitmap(vhd, block, &map); if (err) goto out; err = vhd_index_get_file_id(name, vhd->file, files, &fid); if (err) goto out; for (i = 0; i < vhd->spb; i++) { if (!vhd_bitmap_test(vhd, map, i)) continue; err = vhd_offset(vhd, (uint64_t)block * vhd->spb + i, &off); if (err) goto out; vhdi_block.table[i].file_id = fid; vhdi_block.table[i].offset = off; update++; } if (update) { uint32_t location; err = vhdi_append_block(vhdi, &vhdi_block, &location); if (err) goto out; bat->table[block] = location; } err = 0; out: free(vhdi_block.table); free(map); return err; } static int vhd_index_update_bat_entry(vhdi_name_t *name, vhdi_context_t *vhdi, vhdi_bat_t *bat, vhdi_file_table_t *files, vhd_context_t *vhd, uint32_t block) { char *map; int err, update; uint32_t i, off; vhdi_file_id_t fid; vhdi_block_t vhdi_block; fid = 0; update = 0; if (vhd->bat.bat[block] == DD_BLK_UNUSED) return 0; err = vhd_index_get_block(vhdi, vhd, bat->table[block], &vhdi_block); if (err) return err; err = vhd_read_bitmap(vhd, block, &map); if (err) goto out; err = vhd_index_get_file_id(name, vhd->file, files, &fid); if (err) goto out; for (i = 0; i < vhd->spb; i++) { if (!vhd_bitmap_test(vhd, map, i)) continue; err = vhd_offset(vhd, (uint64_t)block * vhd->spb + i, &off); if (err) goto out; if (vhdi_block.table[i].file_id == fid && vhdi_block.table[i].offset == off) continue; vhdi_block.table[i].file_id = fid; vhdi_block.table[i].offset = off; update++; } if (update) { uint32_t location; err = vhdi_append_block(vhdi, &vhdi_block, &location); if (err) goto out; bat->table[block] = location; } 
err = 0; out: free(vhdi_block.table); free(map); return err; } static int vhd_index_add_bat(vhdi_name_t *name, uint64_t vhd_blocks, uint32_t vhd_block_size) { int err; vhdi_bat_t bat; vhd_context_t vhd; vhdi_context_t vhdi; vhdi_file_table_t files; char *vhd_file, *finished; uint32_t block, remaining; memset(&bat, 0, sizeof(vhdi_bat_t)); memset(&files, 0, sizeof(vhdi_file_table_t)); vhd_file = NULL; finished = NULL; bat.vhd_blocks = vhd_blocks; bat.vhd_block_size = vhd_block_size; strcpy(bat.vhd_path, name->vhd); strcpy(bat.index_path, name->index); strcpy(bat.file_table_path, name->files); err = vhdi_open(&vhdi, name->index, O_RDWR); if (err) return err; err = vhdi_file_table_load(name->files, &files); if (err) { vhdi_close(&vhdi); return err; } err = vhdi_bat_create(name->bat, name->vhd, name->index, name->files); if (err) goto out; bat.table = calloc(vhd_blocks, sizeof(uint32_t)); if (!bat.table) { err = -ENOMEM; goto out; } vhd_file = strdup(name->vhd); if (!vhd_file) goto out; remaining = vhd_blocks; finished = calloc(remaining, sizeof(char)); if (!finished) { err = -ENOMEM; goto out; } for (;;) { err = vhd_open(&vhd, vhd_file, VHD_OPEN_RDONLY); if (err) goto out; err = vhd_get_bat(&vhd); if (err) goto out_vhd; for (block = 0; block < vhd.bat.entries; block++) { if (finished[block]) continue; err = vhd_index_add_bat_entry(name, &vhdi, &bat, &files, &vhd, block, &finished[block]); if (err) goto out_bat; if (finished[block]) remaining--; } free(vhd_file); vhd_file = NULL; if (!remaining || vhd.footer.type != HD_TYPE_DIFF) { vhd_put_bat(&vhd); vhd_close(&vhd); break; } err = vhd_parent_locator_get(&vhd, &vhd_file); if (err) goto out_bat; out_bat: vhd_put_bat(&vhd); out_vhd: vhd_close(&vhd); if (err) goto out; } err = vhdi_bat_write(name->bat, &bat); if (err) goto out; err = 0; out: if (err) unlink(name->bat); vhdi_file_table_free(&files); vhdi_close(&vhdi); free(bat.table); free(finished); free(vhd_file); return err; } static int vhd_index_clone_bat(vhdi_name_t 
/*
 * Refresh an existing BAT file (name->bat) against name->vhd.
 *
 * The BAT and the file table are loaded, every BAT entry of the VHD is
 * passed to vhd_index_update_bat_entry(), and the table is written back
 * with vhdi_bat_write().
 *
 * Returns 0 on success, -errno if name->bat is not readable, or the
 * first error reported by one of the steps above.
 */
static int
vhd_index_update_bat(vhdi_name_t *name)
{
	int rc;
	uint32_t blk;
	vhdi_bat_t bat;
	vhd_context_t vhd;
	vhdi_context_t index;
	vhdi_file_table_t files;

	memset(&bat, 0, sizeof(vhdi_bat_t));
	memset(&files, 0, sizeof(vhdi_file_table_t));

	if (access(name->bat, R_OK) == -1)
		return -errno;

	rc = vhdi_open(&index, name->index, O_RDWR);
	if (rc)
		goto free_table;

	rc = vhdi_bat_load(name->bat, &bat);
	if (rc)
		goto close_index;

	rc = vhdi_file_table_load(name->files, &files);
	if (rc)
		goto close_index;

	rc = vhd_open(&vhd, name->vhd, VHD_OPEN_RDONLY);
	if (rc)
		goto free_files;

	rc = vhd_get_bat(&vhd);
	if (rc)
		goto close_vhd;

	blk = 0;
	while (blk < vhd.bat.entries) {
		rc = vhd_index_update_bat_entry(name, &index, &bat,
						&files, &vhd, blk);
		if (rc)
			goto put_bat;
		blk++;
	}

	/* the result of the final write is the function's result */
	rc = vhdi_bat_write(name->bat, &bat);

put_bat:
	vhd_put_bat(&vhd);
close_vhd:
	vhd_close(&vhd);
free_files:
	vhdi_file_table_free(&files);
close_index:
	vhdi_close(&index);
free_table:
	free(bat.table);
	return rc;
}
= vhd_index_update_bat(name); if (err != -ENOENT) goto out; new_bat = 1; if (parent) { /* clone parent bat if it exists */ err = vhd_index_clone_bat(name, parent); if (err != -ENOENT) goto out; } /* create new bat from scratch */ err = vhd_index_add_bat(name, vhd_blocks, vhd_block_size); if (err) goto out; err = 0; out: if (err) { if (new_bat) unlink(name->bat); if (new_index) { unlink(name->index); unlink(name->files); } } free(parent); return err; } static void vhd_index_print_summary(vhdi_name_t *name, uint32_t block_size, vhdi_file_table_t *files) { int i; char time[26], uuid[37]; printf("VHD INDEX : %s\n", name->index); printf("--------------------\n"); printf("block size : %u\n", block_size); printf("files : %d\n", files->entries); printf("\n"); for (i = 0; i < files->entries; i++) { uuid_unparse(files->table[i].vhd_uuid, uuid); vhd_time_to_string(files->table[i].vhd_timestamp, time); printf(" fid 0x%04x : %s, %s, %s\n", files->table[i].file_id, files->table[i].path, uuid, time); } printf("\n"); } static inline void vhd_index_print_bat_header(const char *name, vhdi_bat_t *bat) { printf("VHD INDEX BAT : %s\n", name); printf("--------------------\n"); printf("blocks : %"PRIu64"\n", bat->vhd_blocks); printf("block size : %u\n", bat->vhd_block_size); printf("vhd path : %s\n", bat->vhd_path); printf("index path : %s\n", bat->index_path); printf("file table path : %s\n", bat->file_table_path); } static int vhd_index_print_vhd_summary(vhdi_name_t *name) { int err; uint32_t i; vhdi_bat_t bat; err = vhdi_bat_load(name->bat, &bat); if (err) return err; vhd_index_print_bat_header(name->bat, &bat); printf("\n"); for (i = 0; i < bat.vhd_blocks; i++) printf(" block 0x%04x : offset 0x%08x\n", i, bat.table[i]); free(bat.table); return 0; } static int vhd_index_print_vhd_block_summary(vhdi_name_t *name, uint32_t block) { int err; uint32_t i; uint32_t off; vhdi_bat_t bat; vhdi_context_t vhdi; vhdi_block_t vhdi_block; err = vhdi_bat_load(name->bat, &bat); if (err) return err; 
vhd_index_print_bat_header(name->bat, &bat); if (block > bat.vhd_blocks) { printf("block %u past end of bat (%"PRIu64")\n", block, bat.vhd_blocks); err = -EINVAL; goto out; } off = bat.table[block]; if (off == DD_BLK_UNUSED) { printf("block %u is unallocated\n", block); err = 0; goto out; } err = vhdi_open(&vhdi, name->index, O_RDWR); if (err) goto out; err = vhdi_read_block(&vhdi, &vhdi_block, off); vhdi_close(&vhdi); if (err) goto out; printf("\nBLOCK 0x%08x\n", block); for (i = 0; i < vhdi_block.entries; i++) printf(" sec 0x%04x : fid 0x%04x, offset 0x%08x\n", i, vhdi_block.table[i].file_id, vhdi_block.table[i].offset); free(vhdi_block.table); err = 0; out: free(bat.table); return err; } static int vhd_index_summary(vhdi_name_t *name, uint32_t block) { int err; uint32_t block_size; vhdi_context_t vhdi; vhdi_file_table_t files; err = vhdi_open(&vhdi, name->index, O_RDWR); if (err) return err; block_size = vhdi.vhd_block_size; vhdi_close(&vhdi); err = vhdi_file_table_load(name->files, &files); if (err) return err; vhd_index_print_summary(name, block_size, &files); if (name->vhd) { if (block == (uint32_t)-1) err = vhd_index_print_vhd_summary(name); else err = vhd_index_print_vhd_block_summary(name, block); if (err) goto out; } err = 0; out: vhdi_file_table_free(&files); return err; } int main(int argc, char *argv[]) { int err; uint32_t block; vhdi_name_t name; char *vhd, *index; int c, update, summary; vhd = NULL; index = NULL; block = (uint32_t)-1; update = 0; summary = 0; while ((c = getopt(argc, argv, "i:v:s:b:h")) != -1) { switch (c) { case 'i': index = optarg; update = 1; break; case 'v': vhd = optarg; break; case 's': index = optarg; summary = 1; break; case 'b': block = strtoul(optarg, NULL, 10); break; default: usage(); } } if (optind != argc) usage(); if (!(update ^ summary)) usage(); if (block != (uint32_t)-1 && (!summary || !vhd)) usage(); err = vhd_index_get_name(index, vhd, &name); if (err) goto out; if (summary) err = vhd_index_summary(&name, block); 
else if (update) { if (!vhd) usage(); err = vhd_index(&name); } out: vhd_index_free_name(&name); return -err; } blktap-2.0.90/Makefile.am0000644000000000000000000000026411664745551013631 0ustar rootroot # ACLOCAL_AMFLAGS = -I m4 SUBDIRS = lvm SUBDIRS += $(MAYBE_part) SUBDIRS += vhd SUBDIRS += control SUBDIRS += drivers SUBDIRS += include if ENABLE_PART MAYBE_part = part endif blktap-2.0.90/part/0000755000000000000000000000000011664745551012541 5ustar rootrootblktap-2.0.90/part/part-util.c0000644000000000000000000001475311664745551014640 0ustar rootroot #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include "partition.h" #if BYTE_ORDER == LITTLE_ENDIAN #define cpu_to_le32(x) (x) #define cpu_to_le64(x) (x) #else #define cpu_to_le32(x) bswap_32(x) #define cpu_to_le64(x) bswap_64(x) #endif static void usage(const char *app) { printf("usage: %s <-i image> " "[-d dump] [-c count] [-f format] " "[-t type] [-s sig ]\n", app); } static void chs_unpack(struct partition_chs *c, uint8_t *head, uint8_t *sector, uint16_t *cylinder) { *head = c->chs[0]; *sector = c->chs[1] & 0x3f; *cylinder = (c->chs[1] & 0xc0) * 4 + c->chs[2]; } void partition_table_dump(struct partition_table *pt) { int i; printf("disk signature 0x%08x\n", pt->disk_signature); printf("mbr signature 0x%04x\n", pt->mbr_signature); printf("\n"); for (i = 0; i < 4; i++) { struct primary_partition *p = pt->partitions + i; uint8_t head, sector; uint16_t cylinder; printf(" %d status 0x%02x\n", i, p->status); chs_unpack(&p->chs_first, &head, §or, &cylinder); printf(" %d s cylinder 0x%04x\n", i, cylinder); printf(" %d s sector 0x%01x\n", i, sector); printf(" %d s head 0x%01x\n", i, head); printf(" %d type 0x%01x\n", i, p->type); chs_unpack(&p->chs_last, &head, §or, &cylinder); printf(" %d e cylinder 0x%04x\n", i, cylinder); printf(" %d e sector 0x%01x\n", i, sector); printf(" %d e head 0x%01x\n", i, head); 
printf(" %d lba 0x%08x\n", i, p->lba); printf(" %d blocks 0x%08x\n", i, p->blocks); printf("\n"); } } static int dump_partitions(const char *image) { int fd, ret; struct partition_table pt; ret = 1; fd = -1; fd = open(image, O_RDONLY); if (fd == -1) goto out; if (read(fd, &pt, sizeof(pt)) != sizeof(pt)) { errno = errno ? : EIO; goto out; } partition_table_in(&pt); if (partition_table_validate(&pt)) { errno = EINVAL; printf("table invalid\n"); goto out; } partition_table_dump(&pt); ret = 0; out: close(fd); return ret; } static void __dump_signature(struct partition_table *pt, int part) { if (part < 1 || part > 4) errno = EINVAL; else { uint8_t *p, *s; uint32_t sig = pt->disk_signature; uint64_t off = (uint64_t)pt->partitions[part - 1].lba << 9; sig = cpu_to_le32(sig); off = cpu_to_le64(off); for (p = s = (uint8_t *)&sig; p - s < sizeof(sig); p++) printf("%02x", *p); for (p = s = (uint8_t *)&off; p - s < sizeof(off); p++) printf("%02x", *p); printf("\n"); } } static int dump_signature(const char *image, int part) { int fd, ret; struct partition_table pt; ret = 1; fd = -1; fd = open(image, O_RDONLY); if (fd == -1) goto out; if (read(fd, &pt, sizeof(pt)) != sizeof(pt)) { errno = errno ? : EIO; goto out; } partition_table_in(&pt); if (partition_table_validate(&pt)) { errno = EINVAL; printf("table invalid\n"); goto out; } __dump_signature(&pt, part); ret = 0; out: close(fd); return ret; } static int count_partitions(const char *image, int *count) { int i, fd, ret; struct partition_table pt; ret = 1; fd = -1; fd = open(image, O_RDONLY); if (fd == -1) goto out; if (read(fd, &pt, sizeof(pt)) != sizeof(pt)) { errno = errno ? 
: EIO; goto out; } partition_table_in(&pt); if (partition_table_validate(&pt)) { *count = 0; goto done; } *count = 0; for (i = 0; i < 4; i++) if (pt.partitions[i].type) (*count)++; done: ret = 0; out: close(fd); return ret; } static int format_partition(const char *image, int type, struct partition_table *pt) { uint64_t lend; uint32_t start, end; int ret, sec_size, fd; unsigned int cylinders; struct hd_geometry geo; struct primary_partition *pp; struct partition_geometry pgeo; unsigned long long bytes, llcyls; ret = 1; fd = -1; memset(pt, 0, sizeof(*pt)); pp = pt->partitions; srandom(time(NULL)); fd = open(image, O_RDWR); if (fd == -1) goto out; if (ioctl(fd, HDIO_GETGEO, &geo)) goto out; if (ioctl(fd, BLKGETSIZE64, &bytes)) goto out; if (ioctl(fd, BLKSSZGET, &sec_size)) goto out; llcyls = (bytes >> 9) / ((sec_size >> 9) * geo.heads * geo.sectors); cylinders = llcyls; if (cylinders != llcyls) cylinders = ~0; pgeo.heads = geo.heads; pgeo.sectors = geo.sectors; pgeo.cylinders = cylinders; start = pgeo.sectors; lend = geo.heads * geo.sectors * llcyls - 1; end = lend; if (end != lend) end = ~0; pp->status = PARTITION_BOOTABLE; pp->type = type; pp->lba = start; pp->blocks = end - start + 1; pp->chs_first = lba_to_chs(&pgeo, start); pp->chs_last = lba_to_chs(&pgeo, lend); pt->mbr_signature = MBR_SIGNATURE; pt->disk_signature = random(); partition_table_out(pt); if (write(fd, pt, sizeof(*pt)) != sizeof(*pt)) { errno = errno ? : EIO; goto out; } ret = 0; out: close(fd); return ret; } int main(int argc, char *argv[]) { char *image; struct partition_table pt; int ret, c, type, count, dump, format, signature; ret = 1; format = 0; count = 0; dump = 0; type = 0; signature = -1; image = NULL; while ((c = getopt(argc, argv, "i:fdt:cs:h")) != -1) { switch (c) { case 'i': image = optarg; break; case 'c': count = 1; break; case 's': signature = atoi(optarg); break; case 'f': format = 1; break; case 't': { int base = (!strncasecmp(optarg, "0x", 2) ? 
16 : 10); type = strtol(optarg, NULL, base); break; } case 'd': dump = 1; break; case 'h': usage(argv[0]); ret = 0; goto out; } } if (!image || (!format && !count && !signature && !dump)) { errno = EINVAL; usage(argv[0]); goto out; } if (format) { if (!type) { errno = EINVAL; perror("type required"); goto out; } if (format_partition(image, type, &pt)) { perror("formatting partition"); goto out; } __dump_signature(&pt, 1); } if (count) { if (count_partitions(image, &count)) { perror("counting partitions"); goto out; } printf("%d\n", count); } if (signature != -1) { if (dump_signature(image, signature)) { perror("dumping signature"); goto out; } } if (dump) { if (dump_partitions(image)) { perror("dumping partitions"); goto out; } } ret = 0; out: return ret; } blktap-2.0.90/part/Makefile.am0000644000000000000000000000040111664745551014570 0ustar rootroot AM_CFLAGS = -Wall AM_CFLAGS += -Werror AM_CPPFLAGS = -D_GNU_SOURCE AM_CPPFLAGS += -I../include sbin_PROGRAMS = part-util part_util_SOURCES = part-util.c part_util_SOURCES += partition.c part_util_SOURCES += partition.h dist_sbin_SCRIPTS = vhdpartx blktap-2.0.90/part/vhdpartx0000755000000000000000000000400511664745551014326 0ustar rootroot#!/bin/sh set -e PARTUTIL=/usr/sbin/part-util LIBVHDIO=/usr/lib/libvhdio.so.1.0 die() { echo "$@" exit 1 } usage() { echo "usage: $0 [-a | -d | -l] vhd [lib]" echo "-a add partition mappings" echo "-d del partition mappings" echo "-l list partition mappings" exit 1 } parse_args() { part_util=$PARTUTIL while [ $# -ge 1 ]; do case $1 in -a) add="TRUE" && count="1$count";; -d) del="TRUE" && count="1$count";; -l) list="TRUE" && count="1$count";; *) if [ -z "$vhd" ]; then vhd=$1; elif [ -z "$lib" ]; then lib=$1; else usage; fi;; esac shift done [[ -z "$lib" ]] && lib=$LIBVHDIO [[ -z "$vhd" || "$count" != "1" ]] && usage return 0 } # screen-scraping of fdisk... 
not used fdisk_read_partitions() { local data=$(LD_PRELOAD=$lib fdisk -l $vhd) local none=$(echo $data | grep "This doesn't look like a partition table") [[ -n "$none" ]] && partitions=0 && return 0 partitions=4 while [[ "$partitions" != "0" ]]; do local hit=$(echo $data | grep "${vhd}$partitions") [[ -n "$hit" ]] && break let partitions=$partitions-1 done } part_util_read_partitions() { partitions=$(LD_PRELOAD=$lib $part_util -c -i $vhd) } list_mappings() { local parts=1 while [[ $parts -le $partitions ]]; do echo ${vhd}$parts let parts=$parts+1 done } add_mappings() { local parts=1 local path=$(realpath $vhd) while [[ $parts -le $partitions ]]; do [[ -e ${path}${parts} ]] || ln -s $(basename $path) ${path}$parts let parts=$parts+1 done } del_mappings() { local parts=1 while [[ $parts -le $partitions ]]; do [[ -L ${vhd}$parts ]] && rm -f ${vhd}$parts let parts=$parts+1 done } main() { parse_args $@ [[ -x $part_util ]] || die "can't find part-util" [[ -r $vhd && -r $lib ]] || die "can't find vhd or lib" part_util_read_partitions [[ -n "$add" ]] && add_mappings [[ -n "$del" ]] && del_mappings [[ -n "$list" ]] && list_mappings return 0 } main $@ blktap-2.0.90/part/partition.h0000644000000000000000000000251411664745551014725 0ustar rootroot#ifndef _PARTITION_H_ #define _PARTITION_H_ #include #define PARTITION_BOOTABLE 0x80 #define PARTITION_NON_BOOTABLE 0x00 #define MBR_SIGNATURE 0xAA55 #define MBR_START_SECTOR 0x80 struct partition_geometry { unsigned char heads; unsigned char sectors; unsigned int cylinders; }; struct partition_chs { uint8_t chs[3]; } __attribute__((__packed__)); struct primary_partition { uint8_t status; struct partition_chs chs_first; uint8_t type; struct partition_chs chs_last; uint32_t lba; uint32_t blocks; } __attribute__((__packed__)); struct partition_table { uint8_t code[0x1b8]; uint32_t disk_signature; uint8_t pad[0x2]; struct primary_partition partitions[4]; uint16_t mbr_signature; } __attribute__((__packed__)); void 
partition_table_in(struct partition_table *); void partition_table_out(struct partition_table *); int partition_table_validate(struct partition_table *); void partition_table_dump(struct partition_table *); struct partition_chs lba_to_chs(struct partition_geometry *, uint64_t); #endif blktap-2.0.90/part/partition.c0000644000000000000000000000423611664745551014723 0ustar rootroot #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include "partition.h" #if BYTE_ORDER == LITTLE_ENDIAN #define le16_to_cpu(x) (x) #define le32_to_cpu(x) (x) #define cpu_to_le16(x) (x) #define cpu_to_le32(x) (x) #else #define le16_to_cpu(x) bswap_16(x) #define le32_to_cpu(x) bswap_32(x) #define cpu_to_le16(x) bswap_16(x) #define cpu_to_le32(x) bswap_32(x) #endif #define ARRAY_SIZE(a) (sizeof(a) / sizeof(a)[0]) void primary_partition_in(struct primary_partition *p) { p->lba = le32_to_cpu(p->lba); p->blocks = le32_to_cpu(p->blocks); } void primary_partition_out(struct primary_partition *p) { p->lba = cpu_to_le32(p->lba); p->blocks = cpu_to_le32(p->blocks); } void partition_table_in(struct partition_table *pt) { int i; pt->disk_signature = le32_to_cpu(pt->disk_signature); pt->mbr_signature = le16_to_cpu(pt->mbr_signature); for (i = 0; i < ARRAY_SIZE(pt->partitions); i++) primary_partition_in(pt->partitions + i); } void partition_table_out(struct partition_table *pt) { int i; pt->disk_signature = cpu_to_le32(pt->disk_signature); pt->mbr_signature = cpu_to_le16(pt->mbr_signature); for (i = 0; i < ARRAY_SIZE(pt->partitions); i++) primary_partition_out(pt->partitions + i); } int primary_partition_validate(struct primary_partition *p) { if (p->status != PARTITION_BOOTABLE && p->status != PARTITION_NON_BOOTABLE) return EINVAL; return 0; } int partition_table_validate(struct partition_table *pt) { int i; if (pt->mbr_signature != MBR_SIGNATURE) return EINVAL; for (i = 0; i < ARRAY_SIZE(pt->partitions); i++) { int err = primary_partition_validate(pt->partitions + i); if (err) 
return err; } return 0; } struct partition_chs lba_to_chs(struct partition_geometry *geo, uint64_t lba) { struct partition_chs c; if (lba >= 0x3ff * geo->sectors * geo->heads) { c.chs[0] = geo->heads - 1; c.chs[1] = geo->sectors; lba = 0x3ff; } else { c.chs[1] = lba % geo->sectors + 1; lba /= geo->sectors; c.chs[0] = lba % geo->heads; lba /= geo->heads; } c.chs[2] = lba & 0xff; c.chs[1] |= (lba >> 2) & 0xc0; return c; } blktap-2.0.90/version.m40000644000000000000000000000006311664745551013521 0ustar rootrootm4_define([BLKTAP_VERSION], [m4_include(VERSION)]) blktap-2.0.90/drivers/0000755000000000000000000000000011664745551013251 5ustar rootrootblktap-2.0.90/drivers/tapdisk-log.c0000644000000000000000000001363711664745551015645 0ustar rootroot/* * Copyright (c) 2008, 2009, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include #include "tapdisk-log.h" #include "tapdisk-utils.h" #include "tapdisk-logfile.h" #include "tapdisk-syslog.h" #include "tapdisk-server.h" #define TLOG_LOGFILE_BUFSZ (16<<10) #define TLOG_SYSLOG_BUFSZ (8<<10) #define MAX_ENTRY_LEN 512 struct tlog { char *name; td_logfile_t logfile; int precious; int level; char *ident; td_syslog_t syslog; unsigned long errors; }; static struct tlog tapdisk_log; static void tlog_logfile_vprint(const char *fmt, va_list ap) { tapdisk_logfile_vprintf(&tapdisk_log.logfile, fmt, ap); } static void __printf(1, 2) tlog_logfile_print(const char *fmt, ...) { va_list ap; va_start(ap, fmt); tlog_logfile_vprint(fmt, ap); va_end(ap); } #define tlog_info(_fmt, _args ...) 
\ tlog_logfile_print("%s: "_fmt, tapdisk_log.ident, ##_args) static void tlog_logfile_save(void) { td_logfile_t *logfile = &tapdisk_log.logfile; const char *name = tapdisk_log.name; int err; tlog_info("saving log, %lu errors", tapdisk_log.errors); tapdisk_logfile_flush(logfile); err = tapdisk_logfile_rename(logfile, TLOG_DIR, name, ".log"); tlog_syslog(LOG_INFO, "logfile saved to %s: %d\n", logfile->path, err); } static void tlog_logfile_close(void) { td_logfile_t *logfile = &tapdisk_log.logfile; int keep; keep = tapdisk_log.precious || tapdisk_log.errors; tlog_info("closing log, %lu errors", tapdisk_log.errors); if (keep) tlog_logfile_save(); tapdisk_logfile_close(logfile); if (!keep) tapdisk_logfile_unlink(logfile); } static int tlog_logfile_open(const char *name, int level) { td_logfile_t *logfile = &tapdisk_log.logfile; int mode, err; err = mkdir(TLOG_DIR, 0755); if (err) { err = -errno; if (err != -EEXIST) goto fail; } err = tapdisk_logfile_open(logfile, TLOG_DIR, name, ".tmp", TLOG_LOGFILE_BUFSZ); if (err) goto fail; mode = (level == TLOG_DBG) ? _IOLBF : _IOFBF; err = tapdisk_logfile_setvbuf(logfile, mode); if (err) goto fail; tlog_info("log start, level %d", level); return 0; fail: tlog_logfile_close(); return err; } static void tlog_syslog_close(void) { td_syslog_t *syslog = &tapdisk_log.syslog; tapdisk_syslog_stats(syslog, LOG_INFO); tapdisk_syslog_flush(syslog); tapdisk_syslog_close(syslog); } static int tlog_syslog_open(const char *ident, int facility) { td_syslog_t *syslog = &tapdisk_log.syslog; int err; err = tapdisk_syslog_open(syslog, tapdisk_log.ident, facility, TLOG_SYSLOG_BUFSZ); return err; } void tlog_vsyslog(int prio, const char *fmt, va_list ap) { td_syslog_t *syslog = &tapdisk_log.syslog; tapdisk_vsyslog(syslog, prio, fmt, ap); } void tlog_syslog(int prio, const char *fmt, ...) 
{ va_list ap; va_start(ap, fmt); tlog_vsyslog(prio, fmt, ap); va_end(ap); } int tlog_open(const char *name, int facility, int level) { int err; DPRINTF("tapdisk-log: started, level %d\n", level); tapdisk_log.level = level; tapdisk_log.name = strdup(name); tapdisk_log.ident = tapdisk_syslog_ident(name); if (!tapdisk_log.name || !tapdisk_log.ident) { err = -errno; goto fail; } err = tlog_logfile_open(tapdisk_log.name, level); if (err) goto fail; err = tlog_syslog_open(tapdisk_log.ident, facility); if (err) goto fail; return 0; fail: tlog_close(); return err; } void tlog_close(void) { DPRINTF("tapdisk-log: closing after %lu errors\n", tapdisk_log.errors); tlog_logfile_close(); tlog_syslog_close(); free(tapdisk_log.ident); tapdisk_log.ident = NULL; } void tlog_precious(void) { if (!tapdisk_log.precious) tlog_logfile_save(); tapdisk_log.precious = 1; } void __tlog_write(int level, const char *fmt, ...) { va_list ap; if (level <= tapdisk_log.level) { va_start(ap, fmt); tlog_logfile_vprint(fmt, ap); va_end(ap); } } void __tlog_error(const char *fmt, ...) { va_list ap; va_start(ap, fmt); tlog_vsyslog(LOG_ERR, fmt, ap); va_end(ap); tapdisk_log.errors++; } void tapdisk_start_logging(const char *ident, const char *_facility) { int facility; facility = tapdisk_syslog_facility(_facility); tapdisk_server_openlog(ident, LOG_CONS|LOG_ODELAY, facility); } void tapdisk_stop_logging(void) { tapdisk_server_closelog(); } blktap-2.0.90/drivers/tapdisk-filter.h0000644000000000000000000000173411664745551016351 0ustar rootroot/* Copyright (c) 2007, XenSource Inc. * All rights reserved. 
*/ #ifndef TAPDISK_FILTER_H #define TAPDISK_FILTER_H #include #include #include #define TD_INJECT_FAULTS 0x00001 /* simulate random IO failures */ #define TD_CHECK_INTEGRITY 0x00002 /* check data integrity */ #define TD_FAULT_RATE 5 struct dhash { uint64_t hash; struct timeval time; }; struct fiocb { size_t bytes; void *data; }; struct tfilter { int mode; uint64_t secs; int iocbs; struct dhash *dhash; int ffree; struct fiocb *fiocbs; struct fiocb **flist; }; struct tfilter *tapdisk_init_tfilter(int mode, int iocbs, uint64_t secs); void tapdisk_free_tfilter(struct tfilter *); void tapdisk_filter_iocbs(struct tfilter *, struct iocb **, int); void tapdisk_filter_events(struct tfilter *, struct io_event *, int); #endif blktap-2.0.90/drivers/lock.c0000644000000000000000000011047211664745551014352 0ustar rootroot/* * Copyright (c) 2007, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * This module implements a "dot locking" style advisory file locking algorithm. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include "lock.h" #define unlikely(x) __builtin_expect(!!(x), 0) /* format: xenlk.hostname.uuid.*/ #define LF_POSTFIX ".xenlk" #define LFXL_FORMAT LF_POSTFIX ".%s.%s.x%s" #define LFFL_FORMAT LF_POSTFIX ".%s.%s.f%s" #define RETRY_MAX 16 #if defined(LOGS) #define LOG(format, args...) printf("%d: ", __LINE__); printf(format, ## args) #else #define LOG(format, args...) 
#endif /* random wait - up to .5 seconds */ #define XSLEEP usleep(random() & 0x7ffff) typedef int (*eval_func)(char *name, int readonly); static char *create_lockfn(char *fn_to_lock) { char *lockfn; /* allocate string to hold constructed lock file */ lockfn = malloc(strlen(fn_to_lock) + strlen(LF_POSTFIX) + 1); if (unlikely(!lockfn)) { return 0; } /* append postfix to file to lock */ strcpy(lockfn, fn_to_lock); strcat(lockfn, LF_POSTFIX); return lockfn; } static char *create_lockfn_link(char *fn_to_lock, char *format, char *uuid, int readonly) { char hostname[128]; char *lockfn_link; char *ptr; /* get hostname */ if (unlikely(gethostname(hostname, sizeof(hostname)) == -1)) { return 0; } /* allocate string to hold constructed lock file link */ lockfn_link = malloc(strlen(fn_to_lock) + strlen(LF_POSTFIX) + strlen(hostname) + strlen(uuid) + 8); if (unlikely(!lockfn_link)) { return 0; } /* construct lock file link with specific format */ strcpy(lockfn_link, fn_to_lock); ptr = lockfn_link + strlen(lockfn_link); sprintf(ptr, format, hostname, uuid, readonly ? 
"r" : "w"); return lockfn_link; } static int NFSnormalizedStatTime(char *fn, struct stat *statnow, int *reterrno) { int result = LOCK_OK; int uniq; char *buf; int fd; int pid = (int)getpid(); int clstat; *reterrno = 0; /* create file to normalize time */ srandom((int)time(0) ^ pid); uniq = random() % 0xffffff; buf = malloc(strlen(fn) + 24); if (unlikely(!buf)) { result = LOCK_ENOMEM; goto finish; } strcpy(buf, fn); sprintf(buf + strlen(buf), ".xen%08d.tmp", uniq); fd = open(buf, O_WRONLY | O_CREAT, 0644); if (fd == -1) { *reterrno = errno; result = LOCK_EOPEN; goto finish; } clstat = close(fd); if (unlikely(clstat == -1)) { LOG("fail on close\n"); } if (lstat(buf, statnow) == -1) { unlink(buf); *reterrno = errno; result = LOCK_ESTAT; goto finish; } unlink(buf); finish: return result; } static int writer_eval(char *name, int readonly) { return name[strlen(name)-1] == 'w'; } static int reader_eval(char *name, int readonly) { return name[strlen(name)-1] == 'r' && !readonly; } static int lock_holder(char *fn, char *lockfn, char *lockfn_link, int force, int readonly, int *stole, eval_func eval, int *elt, int *ioerror) { int status = 0; int ustat; DIR *pd = 0; struct dirent *dptr; char *ptr; char *dirname = malloc(strlen(lockfn)); char *uname = malloc(strlen(lockfn_link) + 8); int elt_established = 0; int fd; char tmpbuf[4096]; *stole = 0; *ioerror = 0; *elt = 0; if (!dirname) goto finish; if (!uname) goto finish; /* get directory */ ptr = strrchr(lockfn, '/'); if (!ptr) { strcpy(dirname, "."); } else { int numbytes = ptr - lockfn; strncpy(dirname, lockfn, numbytes); dirname[numbytes] = '\0'; } pd = opendir(dirname); if (!pd) { *ioerror = errno ? errno : EIO; goto finish; } /* * scan through directory entries and use eval function * if we have a match (i.e. reader or writer lock) but * note that if we are forcing, we will remove any and * all locks that appear for target of our lock, regardless * if it a reader/writer owns the lock. 
*/ errno = 0; dptr = readdir(pd); if (!dptr) { *ioerror = EIO; } while (dptr) { char *p1 = strrchr(fn, '/'); char *p2 = strrchr(lockfn, '/'); char *p3 = strrchr(lockfn_link, '/'); if (p1) p1+=1; if (p2) p2+=1; if (p3) p3+=1; if (strcmp(dptr->d_name, p1 ? p1 : fn) && strcmp(dptr->d_name, p2 ? p2 : lockfn) && strcmp(dptr->d_name, p3 ? p3 : lockfn_link) && !strncmp(dptr->d_name, p1 ? p1 : fn, strlen(p1?p1:fn))) { strcpy(uname, dirname); strcat(uname, "/"); strcat(uname, dptr->d_name); if (!elt_established) { /* read final lock file and extract lease time */ fd = open(uname, O_RDONLY, 0644); memset(tmpbuf, 0, sizeof(tmpbuf)); if (read(fd, tmpbuf, sizeof(tmpbuf)) < 0) { *ioerror = errno; status = 1; close(fd); goto finish; } close(fd); ptr = strrchr(tmpbuf, '.'); if (ptr) { *elt = atoi(ptr+1); elt_established = 1; } } if (force) { ustat = unlink(uname); if (ustat == -1) { LOG("failed to unlink %s\n", uname); } *stole = 1; *elt = 0; } else { if ((*eval)(dptr->d_name, readonly)) { closedir(pd); status = 1; goto finish; } } } dptr = readdir(pd); if (!dptr && !errno) { *ioerror = EIO; } } closedir(pd); finish: free(dirname); free(uname); /* if IO error, force a taken status */ return (*ioerror) ? 
1 : status; } int lock(char *fn_to_lock, char *uuid, int force, int readonly, int *lease_time, int *retstatus) { char *lockfn = 0; char *lockfn_xlink = 0; char *lockfn_flink = 0; char *buf = 0; int fd; int status = 0; struct stat stat1, stat2; int retry_attempts = 0; int clstat; int tmpstat; int stealx = 0; int stealw = 0; int stealr = 0; int established_lease_time = 0; char tmpbuf[4096]; int ioerr; if (!fn_to_lock || !uuid) { *retstatus = LOCK_EBADPARM; return EINVAL; } *retstatus = 0; /* seed random with time/pid combo */ srandom((int)time(0) ^ getpid()); /* build lock file strings */ lockfn = create_lockfn(fn_to_lock); if (unlikely(!lockfn)) { status = ENOMEM; *retstatus = LOCK_ENOMEM; goto finish; } lockfn_xlink = create_lockfn_link(fn_to_lock, LFXL_FORMAT, uuid, readonly); if (unlikely(!lockfn_xlink)) { status = ENOMEM; *retstatus = LOCK_ENOMEM; goto finish; } lockfn_flink = create_lockfn_link(fn_to_lock, LFFL_FORMAT, uuid, readonly); if (unlikely(!lockfn_flink)) { status = ENOMEM; *retstatus = LOCK_ENOMEM; goto finish; } try_again: if (retry_attempts++ > RETRY_MAX) { if (*retstatus == LOCK_EXLOCK_OPEN) { struct stat statnow, stat_exlock; int diff; if (lstat(lockfn, &stat_exlock) == -1) { goto finish; } if (NFSnormalizedStatTime(fn_to_lock, &statnow, &ioerr)) { goto finish; } diff = (int)statnow.st_mtime - (int)stat_exlock.st_mtime; if (diff > DEFAULT_LEASE_TIME_SECS) { unlink(lockfn); retry_attempts = 0; goto try_again; } } goto finish; } /* try to open exlusive lockfile */ fd = open(lockfn, O_WRONLY | O_CREAT | O_EXCL, 0644); if (fd == -1) { LOG("Initial lockfile creation failed %s force=%d, errno=%d\n", lockfn, force, errno); if (errno == EIO) { *retstatus = LOCK_EXLOCK_OPEN; status = EIO; goto finish; } /* already owned? 
(hostname & uuid match, skip time bits) */ errno = 0; fd = open(lockfn, O_RDWR, 0644); if (fd != -1) { buf = malloc(strlen(lockfn_xlink)+1); if (!buf) { clstat = close(fd); if (unlikely(clstat == -1)) { LOG("fail on close\n"); } *retstatus = LOCK_ENOMEM; status = ENOMEM; goto finish; } if (read(fd, buf, strlen(lockfn_xlink)) != (strlen(lockfn_xlink))) { clstat = close(fd); if (unlikely(clstat == -1)) { LOG("fail on close\n"); } free(buf); goto force_lock; } if (!strncmp(buf, lockfn_xlink, strlen(lockfn_xlink)-1)) { LOG("lock owned by us, reasserting\n"); /* our lock, reassert by rewriting below */ if (lseek(fd, 0, SEEK_SET) == -1) { clstat = close(fd); if (unlikely(clstat == -1)) { LOG("fail on close\n"); } goto force_lock; } free(buf); goto skip; } free(buf); clstat = close(fd); if (unlikely(clstat == -1)) { LOG("fail on close\n"); } } force_lock: if (errno == EIO) { *retstatus = LOCK_EXLOCK_OPEN; status = EIO; goto finish; } if (force) { /* remove lock file, we are forcing lock, try again */ status = unlink(lockfn); if (unlikely(status == -1)) { if (errno == EIO) { *retstatus = LOCK_EXLOCK_OPEN; status = EIO; goto finish; } LOG("force removal of %s lockfile failed, " "errno=%d, trying again\n", lockfn, errno); } stealx = 1; } XSLEEP; *retstatus = LOCK_EXLOCK_OPEN; goto try_again; } LOG("lockfile created %s\n", lockfn); skip: /* * write into the temporary xlock */ if (write(fd, lockfn_xlink, strlen(lockfn_xlink)) != strlen(lockfn_xlink)) { if (errno == EIO) { *retstatus = LOCK_EXLOCK_WRITE; status = EIO; goto finish; } status = errno; clstat = close(fd); if (unlikely(clstat == -1)) { LOG("fail on close\n"); } XSLEEP; *retstatus = LOCK_EXLOCK_WRITE; if (unlink(lockfn) == -1) { LOG("removal of %s lockfile failed, " "errno=%d, trying again\n", lockfn, errno); } goto try_again; } clstat = close(fd); if (unlikely(clstat == -1)) { LOG("fail on close\n"); } while (retry_attempts++ < RETRY_MAX) { tmpstat = link(lockfn, lockfn_xlink); LOG("linking %s and %s\n", lockfn, 
lockfn_xlink); if ((tmpstat == -1) && (errno != EEXIST)) { LOG("link status is %d, errno=%d\n", tmpstat, errno); } if ((lstat(lockfn, &stat1) == -1) || (lstat(lockfn_xlink, &stat2) == -1)) { /* try again, cleanup first */ tmpstat = unlink(lockfn); if (unlikely(tmpstat == -1)) { LOG("error removing lock file %s", lockfn); } tmpstat = unlink(lockfn_xlink); if (unlikely(tmpstat == -1)) { LOG("error removing linked lock file %s", lockfn_xlink); } XSLEEP; status = LOCK_ESTAT; goto finish; } /* compare inodes */ if (stat1.st_ino == stat2.st_ino) { /* success, inodes are the same */ /* should we check that st_nlink's are also 2?? */ *retstatus = LOCK_OK; status = 0; tmpstat = unlink(lockfn_xlink); if (unlikely(tmpstat == -1)) { LOG("error removing linked lock file %s", lockfn_xlink); } goto finish; } else { status = errno; /* try again, cleanup first */ tmpstat = unlink(lockfn); if (unlikely(tmpstat == -1)) { LOG("error removing lock file %s", lockfn); } tmpstat = unlink(lockfn_xlink); if (unlikely(tmpstat == -1)) { LOG("error removing linked lock file %s", lockfn_xlink); } XSLEEP; *retstatus = LOCK_EINODE; goto try_again; } } finish: if (!*retstatus) { /* we have exclusive lock */ status = 0; /* fast check, see if we own a final lock and are reasserting */ if (!lstat(lockfn_flink, &stat1)) { char *ptr; /* set the return value to notice this is a reassert */ *retstatus = 1; /* read existing lock file and extract established lease time */ fd = open(lockfn_flink, O_RDONLY, 0644); memset(tmpbuf, 0, sizeof(tmpbuf)); if (read(fd, tmpbuf, sizeof(tmpbuf)) < 0) { if (errno == EIO) { close(fd); *retstatus = LOCK_EINODE; status = EIO; goto skip_scan; } } close(fd); ptr = strrchr(tmpbuf, '.'); if (ptr) { *lease_time = atoi(ptr+1); } else { *lease_time = 10; /* wkchack */ } goto skip_scan; } else { if (errno == EIO) { *retstatus = LOCK_EINODE; status = EIO; goto skip_scan; } } /* we allow exclusive writer, or multiple readers */ if (lock_holder(fn_to_lock, lockfn, lockfn_flink, 
force, readonly, &stealw, writer_eval, &established_lease_time, &ioerr)) { if (ioerr) { *retstatus = LOCK_EREAD; status = ioerr; goto skip_scan; } *retstatus = LOCK_EHELD_WR; } else if (lock_holder(fn_to_lock, lockfn, lockfn_flink, force, readonly, &stealr, reader_eval, &established_lease_time, &ioerr)) { if (ioerr) { *retstatus = LOCK_EREAD; status = ioerr; goto skip_scan; } *retstatus = LOCK_EHELD_RD; } if (established_lease_time) *lease_time = established_lease_time; } skip_scan: if (*retstatus >= 0) { /* update file, changes last modify time */ fd = open(lockfn_flink, O_WRONLY | O_CREAT, 0644); if (fd == -1) { *retstatus = LOCK_EOPEN; status = errno; } else { char tmpbuf[32]; int failed_write; memset(tmpbuf, 0, sizeof(tmpbuf)); sprintf(tmpbuf, ".%d", *lease_time); failed_write = write(fd, lockfn_flink, strlen(lockfn_flink)) != strlen(lockfn_flink); if (failed_write) status = errno; failed_write |= write(fd, tmpbuf, strlen(tmpbuf)) != strlen(tmpbuf); if (failed_write) status = errno; if (failed_write) { clstat = close(fd); if (unlikely(clstat == -1)) { LOG("fail on close\n"); } XSLEEP; *retstatus = LOCK_EUPDATE; goto try_again; } } clstat = close(fd); if (unlikely(clstat == -1)) { LOG("fail on close\n"); } } if (!*retstatus && force && (stealx || stealw || stealr)) { struct timeval timeout; /* enforce quiet time on steal */ timeout.tv_sec = *lease_time; timeout.tv_usec = 0; select(0, 0, 0, 0, &timeout); } /* remove exclusive lock, final read/write locks will hold */ tmpstat = unlink(lockfn); if (unlikely(tmpstat == -1)) { LOG("error removing exclusive lock file %s", lockfn); } free(lockfn); free(lockfn_xlink); free(lockfn_flink); /* set lease time to -1 if error, so no one is apt to use it */ if (*retstatus < 0) *lease_time = -1; LOG("returning status %d, errno=%d\n", status, errno); return status; } int unlock(char *fn_to_unlock, char *uuid, int readonly, int *status) { char *lockfn_link = 0; int reterrno = 0; if (!fn_to_unlock || !uuid) { *status = 
LOCK_EBADPARM; return 0; } lockfn_link = create_lockfn_link(fn_to_unlock, LFFL_FORMAT, uuid, readonly); if (unlikely(!lockfn_link)) { *status = LOCK_ENOMEM; goto finish; } if (unlink(lockfn_link) == -1) { LOG("error removing linked lock file %s", lockfn_link); reterrno = errno; *status = LOCK_ENOLOCK; goto finish; } *status = LOCK_OK; finish: free(lockfn_link); return reterrno; } int lock_delta(char *fn, int *ret_lease, int *max_lease) { int reterrno = 0; DIR *pd = 0; struct dirent *dptr; char *ptr; int result = INT_MAX; struct stat statbuf, statnow; char *dirname = malloc(strlen(fn)); char *uname = malloc(strlen(fn) + 8); int elt_established = 0; char *dotptr; char tmpbuf[4096]; int fd; if (!fn || !dirname || !uname) { *ret_lease = LOCK_EBADPARM; *max_lease = -1; return 0; } if (NFSnormalizedStatTime(fn, &statnow, &reterrno)) { result = LOCK_ESTAT; goto finish; } /* get directory */ ptr = strrchr(fn, '/'); if (!ptr) { strcpy(dirname, "."); ptr = fn; } else { int numbytes = ptr - fn; strncpy(dirname, fn, numbytes); ptr += 1; } pd = opendir(dirname); if (!pd) { reterrno = errno; goto finish; } dptr = readdir(pd); while (dptr) { if (strcmp(dptr->d_name, ptr) && !strncmp(dptr->d_name, ptr, strlen(ptr))) { char *fpath = malloc(strlen(dptr->d_name) + strlen(dirname) + 2); if (!fpath) { closedir(pd); result = LOCK_ENOMEM; goto finish; } strcpy(fpath, dirname); strcat(fpath, "/"); strcat(fpath, dptr->d_name); if (lstat(fpath, &statbuf) != -1) { int diff = (int)statnow.st_mtime - (int)statbuf.st_mtime; /* adjust diff if someone updated the lock between now and when we created the "now" file */ diff = (diff < 0) ? 0 : diff; result = diff < result ? diff : result; } else { closedir(pd); reterrno = errno; goto finish; } if (!elt_established) { /* read final lock file and extract lease time */ fd = open(fpath, O_RDONLY, 0644); memset(tmpbuf, 0, sizeof(tmpbuf)); if (read(fd, tmpbuf, sizeof(tmpbuf)) < 0) { /* error on read? 
*/ } close(fd); dotptr = strrchr(tmpbuf, '.'); if (dotptr) { *max_lease = atoi(dotptr+1); elt_established = 1; } } free(fpath); } dptr = readdir(pd); } closedir(pd); finish: free(dirname); free(uname); /* returns smallest lock time, or error */ if (result == INT_MAX) result = LOCK_ENOLOCK; /* set lease time to -1 if error, so no one is apt to use it */ if ((result < 0) || reterrno) *max_lease = -1; *ret_lease = result; return reterrno; } #if defined(TEST) /* * the following is for sanity testing. */ static void usage(char *prg) { printf("usage %s\n" " dtr ]\n" " p [num iterations]\n" " u [0|1] []\n" " l [0|1] [0|1] [] []\n", prg); printf(" p : perf test lock take and reassert\n"); printf(" d : delta lock time\n"); printf(" t : test the file (after random locks)\n"); printf(" r : random lock tests (must ^C)\n"); printf(" u : unlock, readonly? uniqID (default is PID)\n"); printf(" l : lock, readonly? force?, uniqID (default is PID), lease time\n"); } static void test_file(char *fn) { FILE *fptr; int prev_count = 0; int count, pid, time; fptr = fopen(fn, "r"); if (!fptr) { LOG("ERROR on file %s open, errno=%d\n", fn, errno); return; } while (!feof(fptr)) { fscanf(fptr, "%d %d %d\n", &count, &pid, &time); if (prev_count != count) { LOG("ERROR: prev_count=%d, count=%d, pid=%d, time=%d\n", prev_count, count, pid, time); } prev_count = count + 1; } } static void random_locks(char *fn) { int pid = getpid(); int status; char *filebuf = malloc(256); int count = 0; int dummy; int clstat; char uuid[12]; int readonly; int lease = DEFAULT_LEASE_TIME_SECS; int err; /* this will never return, kill to exit */ srandom((int)time(0) ^ pid); LOG("pid: %d using file %s\n", pid, fn); sprintf(uuid, "%08d", pid); while (1) { XSLEEP; readonly = random() & 1; sysstatus = lock(fn, uuid, 0, readonly, &lease, status); if (status == LOCK_OK) { /* got lock, open, read, modify write close file */ int fd = open(fn, O_RDWR, 0644); if (fd == -1) { LOG("pid: %d ERROR on file %s open, errno=%d\n", pid, 
fn, errno); } else { if (!readonly) { /* ugly code to read data in test format */ /* format is "%d %d %d" 'count pid time' */ struct stat statbuf; int bytes; status = stat(fn, &statbuf); if (status != -1) { if (statbuf.st_size > 256) { lseek(fd, -256, SEEK_END); } memset(filebuf, 0, 256); bytes = read(fd, filebuf, 256); if (bytes) { int bw = bytes-2; while (bw && filebuf[bw]!='\n') bw--; if (!bw) bw = -1; sscanf(&filebuf[bw+1], "%d %d %d", &count, &dummy, &dummy); count += 1; } lseek(fd, 0, SEEK_END); sprintf(filebuf, "%d %d %d\n", count, pid, (int)time(0)); write(fd, filebuf, strlen(filebuf)); } else { LOG("pid: %d ERROR on file %s stat, " "errno=%d\n", pid, fn, errno); } } clstat = close(fd); if (unlikely(clstat == -1)) { LOG("fail on close\n"); } } XSLEEP; err = unlock(fn, uuid, readonly, &status); LOG("unlock status is %d (err=%d)\n", status, err); } } } static void perf_lock(char *fn, int loops) { int sysstatus; char buf[9]; int start = loops; int lease = DEFAULT_LEASE_TIME_SECS; sprintf(buf, "%08d", getpid()); while (loops--) { sysstatus = lock(fn, buf, 0, 0, &lease, &status); if (status < 0) { printf("failed to get lock at iteration %d errno=%d\n", start - loops, errno); return; } } unlock(fn, buf, 0, &status); } int main(int argc, char *argv[]) { int status; char *ptr; char uuid[12]; int force; int readonly; int max_lease, cur_lease; int intstatus; int lease = DEFAULT_LEASE_TIME_SECS; if (argc < 3) { usage(argv[0]); return 0; } sprintf(uuid, "%08d", getpid()); ptr = uuid; if (!strcmp(argv[1],"d")) { status = lock_delta(argv[2], &cur_lease, &max_lease); printf("lock delta for %s is %d seconds, max lease is %d\n", argv[2], cur_lease, max_lease); } else if (!strcmp(argv[1],"t")) { test_file(argv[2]); } else if (!strcmp(argv[1],"r")) { random_locks(argv[2]); } else if (!strcmp(argv[1],"p")) { perf_lock(argv[2], argc < 3 ? 
100000 : atoi(argv[3])); } else if (!strcmp(argv[1],"l")) { if (argc < 4) force = 0; else force = atoi(argv[3]); if (argc < 5) readonly = 0; else readonly = atoi(argv[4]); if (argc >= 6) ptr = argv[5]; if (argc == 7) lease = atoi(argv[6]); status = lock(argv[2], ptr, readonly, force, &lease, &intstatus); printf("lock status = %d\n", status); } else if (!strcmp(argv[1],"u") ) { if (argc < 5) readonly = 0; else readonly = atoi(argv[3]); if (argc == 5) ptr = argv[4]; status = unlock(argv[2], ptr, readonly, &intstatus); printf("unlock status = %d\n", intstatus); } else { usage(argv[0]); } return status; } #elif defined(UTIL) /* * the following is used for non-libary, standalone * program utility as a shell program */ static void usage(char *prg) { printf("usage %s\n" " delta \n" " unlock \n" " lock <0|1> \n", prg); printf(" delta : get time since lock last refreshed\n"); printf(" returns delta time and max lease time in seconds\n"); printf(" unlock: unlock request filename, r|w, uniqID\n"); printf(" returns status (success is 0)\n"); printf(" lock : lock request filename, r|w, force?, uniqID, lease time request\n"); printf(" returns status (success is 0) and established lease time in seconds\n"); } int main(int argc, char *argv[]) { int status = 0; int dlock; char *ptr; int force; int readonly; int cur_lease, max_lease, intstatus; int lease = DEFAULT_LEASE_TIME_SECS; if (argc < 3) { if (argc == 2 && !strcmp(argv[1], "-h")) { usage(argv[0]); } else { printf("%d\n", LOCK_EUSAGE); } return 0; } if (!strcmp(argv[1],"delta") && (argc == 3)) { status = lock_delta(argv[2], &cur_lease, &max_lease); printf("%d %d\n", cur_lease, max_lease); } else if (!strcmp(argv[1],"lock") && (argc == 7)) { readonly = (strcmp(argv[3], "r") == 0) ? 
1 : 0; force = atoi(argv[4]); ptr = argv[5]; lease = atoi(argv[6]); status = lock(argv[2], ptr, force, readonly, &lease, &intstatus); printf("%d %d\n", intstatus, lease); } else if (!strcmp(argv[1],"unlock") && (argc == 5)) { readonly = (strcmp(argv[3], "r") == 0) ? 1 : 0; ptr = argv[4]; status = unlock(argv[2], ptr, readonly, &intstatus); printf("%d\n", intstatus); } else { printf("%d\n", LOCK_EUSAGE); } /* this is either 0 or a system defined errno */ return status; } #endif blktap-2.0.90/drivers/tapdisk-image.h0000644000000000000000000000660311664745551016146 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _TAPDISK_IMAGE_H_ #define _TAPDISK_IMAGE_H_ #include "tapdisk.h" struct td_image_handle { int type; char *name; td_flag_t flags; td_driver_t *driver; td_disk_info_t info; struct list_head next; /* * Basic datapath statistics, in sectors read/written. * * hits: requests completed by this image. * fail: requests completed with failure by this image. * * Not that we do not count e.g. * miss: requests forwarded. * total: requests processed by this image. * * This is because we'd have to compensate for restarts due to * -EBUSY conditions. Those can be extrapolated by following * the chain instead: sum(image[i].hits, i=0..) 
== vbd.secs; */ struct { td_sector_count_t hits; td_sector_count_t fail; } stats; }; #define tapdisk_for_each_image(_image, _head) \ list_for_each_entry(_image, _head, next) #define tapdisk_for_each_image_safe(_image, _next, _head) \ list_for_each_entry_safe(_image, _next, _head, next) #define tapdisk_for_each_image_reverse(_image, _head) \ list_for_each_entry_reverse(_image, _head, next) #define tapdisk_image_entry(_head) \ list_entry(_head, td_image_t, next) int tapdisk_image_open(int, const char *, int, td_image_t **); void tapdisk_image_close(td_image_t *); int tapdisk_image_open_chain(const char *, int, int, struct list_head *); void tapdisk_image_close_chain(struct list_head *); int tapdisk_image_validate_chain(struct list_head *); td_image_t *tapdisk_image_allocate(const char *, int, td_flag_t); void tapdisk_image_free(td_image_t *); int tapdisk_image_check_td_request(td_image_t *, td_request_t); int tapdisk_image_check_request(td_image_t *, struct td_vbd_request *); void tapdisk_image_stats(td_image_t *, td_stats_t *); #endif blktap-2.0.90/drivers/td-rated.c0000644000000000000000000007324211664745551015131 0ustar rootroot/* * Copyright (c) 2011, Citrix Systems. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include "block-valve.h" #include "compiler.h" #include "list.h" static void rlb_vlog_vfprintf(int prio, const char *fmt, va_list ap) { vfprintf(stderr, fmt, ap); fputc('\n', stderr); } static void (*rlb_vlog)(int prio, const char *fmt, va_list ap); __printf(2, 3) static void rlb_log(int prio, const char *fmt, ...) { va_list ap; va_start(ap, fmt); rlb_vlog(prio, fmt, ap); va_end(ap); } static int debug = 0; #define DBG(_l, _f, _a...) if (debug >= _l) { rlb_log(LOG_DEBUG, _f, ##_a); } #define INFO(_f, _a...) rlb_log(LOG_INFO, _f, ##_a) #define WARN(_f, _a...) rlb_log(LOG_WARNING, "WARNING: " _f ", in %s:%d", \ ##_a, __func__, __LINE__) #define ERR(_f, _a...) rlb_log(LOG_ERR, "ERROR: " _f ", in %s:%d", \ ##_a, __func__, __LINE__) #define PERROR(_f, _a...) 
rlb_log(LOG_ERR, _f ": %s in %s:%d", \ ##_a, strerror(errno), __func__, __LINE__) #define BUG() do { \ ERR("Aborting"); \ abort(); \ } while (0) #define BUG_ON(_cond) \ if (unlikely(_cond)) { \ ERR("(%s) = %d", #_cond, _cond); \ BUG(); \ } #define WARN_ON(_cond) ({ \ int __cond = _cond; \ if (unlikely(__cond)) \ WARN("(%s) = %d", #_cond, _cond); \ __cond; \ }) #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define ARRAY_SIZE(_a) (sizeof(_a)/sizeof((_a)[0])) typedef struct ratelimit_bridge td_rlb_t; typedef struct ratelimit_connection td_rlb_conn_t; struct ratelimit_connection { int sock; unsigned long need; /* I/O requested */ unsigned long gntd; /* I/O granted, pending */ struct list_head open; /* connected */ struct list_head wait; /* need > 0 */ struct { struct timeval since; struct timeval total; } wstat; }; #define RLB_CONN_MAX 1024 struct ratelimit_ops { void (*usage)(td_rlb_t *rlb, FILE *stream, void *data); int (*create)(td_rlb_t *rlb, int argc, char **argv, void **data); void (*destroy)(td_rlb_t *rlb, void *data); void (*info)(td_rlb_t *rlb, void *data); void (*settimeo)(td_rlb_t *rlb, struct timeval **tv, void *data); void (*timeout)(td_rlb_t *rlb, void *data); void (*dispatch)(td_rlb_t *rlb, void *data); void (*reset)(td_rlb_t *rlb, void *data); }; struct ratelimit_bridge { char *name; char *ident; struct sockaddr_un addr; char *path; int sock; struct list_head open; /* all connections */ struct list_head wait; /* all in need */ struct timeval ts, now; td_rlb_conn_t connv[RLB_CONN_MAX]; td_rlb_conn_t *free[RLB_CONN_MAX]; int n_free; struct rlb_valve { struct ratelimit_ops *ops; void *data; } valve; }; #define rlb_for_each_conn(_conn, _rlb) \ list_for_each_entry(_conn, &(_rlb)->open, open) #define rlb_for_each_conn_safe(_conn, _next, _rlb) \ list_for_each_entry_safe(_conn, _next, &(_rlb)->open, open) #define rlb_for_each_waiting(_conn, _next, _rlb) \ list_for_each_entry(_conn, _next, &(_rlb)->wait, wait) #define 
rlb_for_each_waiting_safe(_conn, _next, _rlb) \ list_for_each_entry_safe(_conn, _next, &(_rlb)->wait, wait) #define rlb_conn_entry(_list) \ list_entry(_list, td_rlb_conn_t, open) #define rlb_wait_entry(_list) \ list_entry(_list, td_rlb_conn_t, wait) static struct ratelimit_ops *rlb_find_valve(const char *name); static int rlb_create_valve(td_rlb_t *, struct rlb_valve *, const char *name, int argc, char **argv); /* * util */ #define case_G case 'G': case 'g' #define case_M case 'M': case 'm' #define case_K case 'K': case 'k' static long rlb_strtol(const char *s) { unsigned long l, u = 1; char *end, p, q; l = strtoul(s, &end, 0); if (!*end) return l; p = *end++; switch (p) { case_G: case_M: case_K: q = *end++; switch (q) { case 'i': switch (p) { case_G: u *= 1024; case_M: u *= 1024; case_K: u *= 1024; } break; case 0: switch (p) { case_G: u *= 1000; case_M: u *= 1000; case_K: u *= 1000; } break; default: goto fail; } break; case 0: break; default: goto fail; } return l * u; fail: return -EINVAL; } static char* vmprintf(const char *fmt, va_list ap) { char *s; int n; n = vasprintf(&s, fmt, ap); if (n < 0) s = NULL; return s; } __printf(1, 2) static char* mprintf(const char *fmt, ...) { va_list ap; char *s; va_start(ap, fmt); s = vmprintf(fmt, ap); va_end(ap); return s; } static int sysctl_vscanf(const char *name, const char *fmt, va_list ap) { char *path = NULL; FILE *s = NULL; int rv; path = mprintf("/proc/sys/%s", name); if (!path) { rv = -errno; goto fail; } s = fopen(path, "r"); if (!s) { rv = -errno; goto fail; } rv = vfscanf(s, fmt, ap); fail: if (s) fclose(s); if (path) free(path); return rv; } static int sysctl_scanf(const char *name, const char *fmt, ...) 
{ va_list(ap); int rv; va_start(ap, fmt); rv = sysctl_vscanf(name, fmt, ap); va_end(ap); return rv; } static long sysctl_strtoul(const char *name) { unsigned val; int n; n = sysctl_scanf(name, "%lu", &val); if (n < 0) return n; if (n != 1) return -EINVAL; return val; } static long long rlb_tv_usec(const struct timeval *tv) { long long us; us = tv->tv_sec; us *= 1000000; us += tv->tv_usec; return us; } static long long rlb_usec_since(td_rlb_t *rlb, const struct timeval *since) { struct timeval delta; timersub(&rlb->now, since, &delta); return rlb_tv_usec(&delta); } static inline void rlb_argv_shift(int *optind, int *argc, char ***argv) { /* reset optind and args after '--' */ *optind -= 1; *argc -= *optind; *argv += *optind; *optind = 1; } /* * socket I/O */ static void rlb_sock_close(td_rlb_t *rlb) { if (rlb->path) { unlink(rlb->path); rlb->path = NULL; } if (rlb->sock >= 0) { close(rlb->sock); rlb->sock = -1; } } static int rlb_sock_open(td_rlb_t *rlb) { int s, err; rlb->sock = -1; s = socket(AF_UNIX, SOCK_STREAM, 0); if (s < 0) { PERROR("socket"); err = -errno; goto fail; } rlb->sock = s; rlb->addr.sun_family = AF_UNIX; if (rlb->name[0] == '/') strncpy(rlb->addr.sun_path, rlb->name, sizeof(rlb->addr.sun_path)); else snprintf(rlb->addr.sun_path, sizeof(rlb->addr.sun_path), "%s/%s", TD_VALVE_SOCKDIR, rlb->name); err = bind(rlb->sock, &rlb->addr, sizeof(rlb->addr)); if (err) { PERROR("%s", rlb->addr.sun_path); err = -errno; goto fail; } rlb->path = rlb->addr.sun_path; err = listen(rlb->sock, RLB_CONN_MAX); if (err) { PERROR("listen(%s)", rlb->addr.sun_path); err = -errno; goto fail; } return 0; fail: rlb_sock_close(rlb); return err; } static int rlb_sock_send(td_rlb_t *rlb, td_rlb_conn_t *conn, const void *msg, size_t size) { ssize_t n; n = send(conn->sock, msg, size, MSG_DONTWAIT); if (n < 0) return -errno; if (n && n != size) return -EPROTO; return 0; } static int rlb_sock_recv(td_rlb_t *rlb, td_rlb_conn_t *conn, void *msg, size_t size) { ssize_t n; n = 
recv(conn->sock, msg, size, MSG_DONTWAIT); if (n < 0) return -errno; return n; } static td_rlb_conn_t * rlb_conn_alloc(td_rlb_t *rlb) { td_rlb_conn_t *conn = NULL; if (likely(rlb->n_free > 0)) conn = rlb->free[--rlb->n_free]; return conn; } static void rlb_conn_free(td_rlb_t *rlb, td_rlb_conn_t *conn) { BUG_ON(rlb->n_free >= RLB_CONN_MAX); rlb->free[rlb->n_free++] = conn; } static int rlb_conn_id(td_rlb_t *rlb, td_rlb_conn_t *conn) { return conn - rlb->connv; } static void rlb_conn_info(td_rlb_t *rlb, td_rlb_conn_t *conn) { long long wtime; int waits; wtime = 0; waits = !list_empty(&conn->wait); if (waits) wtime = rlb_usec_since(rlb, &conn->wstat.since) / 1000; WARN_ON(!!conn->need != waits); INFO("conn[%d] needs %lu (since %llu ms, total %lu.%06lu s)," " %lu granted", rlb_conn_id(rlb, conn), conn->need, wtime, conn->wstat.total.tv_sec, conn->wstat.total.tv_usec, conn->gntd); } static void rlb_conn_infos(td_rlb_t *rlb) { td_rlb_conn_t *conn; rlb_for_each_conn(conn, rlb) rlb_conn_info(rlb, conn); } static void rlb_conn_close(td_rlb_t *rlb, td_rlb_conn_t *conn) { int s = conn->sock; INFO("Connection %d closed.", rlb_conn_id(rlb, conn)); rlb_conn_info(rlb, conn); if (s) { close(s); conn->sock = -1; } list_del_init(&conn->wait); list_del(&conn->open); rlb_conn_free(rlb, conn); } static void rlb_conn_receive(td_rlb_t *rlb, td_rlb_conn_t *conn) { struct td_valve_req buf[32], req = { -1, -1 }; ssize_t n; int i, err; n = rlb_sock_recv(rlb, conn, buf, sizeof(buf)); if (!n) goto close; if (n < 0) { err = n; if (err != -EAGAIN) goto fail; } if (unlikely(n % sizeof(req))) { err = -EPROTO; goto fail; } for (i = 0; i < n / sizeof(buf[0]); i++) { req = buf[i]; if (unlikely(req.need > TD_RLB_REQUEST_MAX)) { err = -EINVAL; goto fail; } if (unlikely(req.done > conn->gntd)) { err = -EINVAL; goto fail; } conn->need += req.need; conn->gntd -= req.done; DBG(8, "rcv: %lu/%lu need=%lu gntd=%lu", req.need, req.done, conn->need, conn->gntd); if (unlikely(conn->need > TD_RLB_REQUEST_MAX)) { 
err = -EINVAL; goto fail; } } if (conn->need && list_empty(&conn->wait)) { list_add_tail(&conn->wait, &rlb->wait); conn->wstat.since = rlb->now; } return; fail: WARN("err = %d (%s)" " (need %ld/%ld, %ld/%ld done)," " closing connection.", err, strerror(-err), req.need, conn->need, req.done, conn->gntd); rlb_conn_info(rlb, conn); close: rlb_conn_close(rlb, conn); } static void rlb_conn_respond(td_rlb_t *rlb, td_rlb_conn_t *conn, unsigned long need) { int err; BUG_ON(need > conn->need); err = rlb_sock_send(rlb, conn, &need, sizeof(need)); if (err) goto fail; conn->need -= need; conn->gntd += need; DBG(8, "snd: %lu need=%lu gntd=%lu", need, conn->need, conn->gntd); if (!conn->need) { struct timeval delta; timersub(&rlb->now, &conn->wstat.since, &delta); timeradd(&conn->wstat.total, &delta, &conn->wstat.total); list_del_init(&conn->wait); } return; fail: WARN("err = %d, killing connection.", err); rlb_conn_close(rlb, conn); } static void rlb_accept_conn(td_rlb_t *rlb) { td_rlb_conn_t *conn; int s, err; s = accept(rlb->sock, NULL, NULL); if (!s) { err = -errno; goto fail; } conn = rlb_conn_alloc(rlb); if (!conn) { err = -ENOMEM; close(s); goto fail; } INFO("Accepting connection %td.", conn - rlb->connv); memset(conn, 0, sizeof(*conn)); INIT_LIST_HEAD(&conn->wait); conn->sock = s; list_add_tail(&conn->open, &rlb->open); return; fail: WARN("err = %d", err); } static long long rlb_pending(td_rlb_t *rlb) { td_rlb_conn_t *conn; long long pend = 0; rlb_for_each_conn(conn, rlb) pend += conn->gntd; return pend; } /* * token bucket valve */ typedef struct ratelimit_token td_rlb_token_t; struct ratelimit_token { long cred; long cap; long rate; struct timeval timeo; }; static void rlb_token_settimeo(td_rlb_t *rlb, struct timeval **_tv, void *data) { td_rlb_token_t *token = data; struct timeval *tv = &token->timeo; long long us; if (list_empty(&rlb->wait)) { *_tv = NULL; return; } WARN_ON(token->cred >= 0); us = -token->cred; us *= 1000000; us /= token->rate; tv->tv_sec = us / 
1000000; tv->tv_usec = us % 1000000; WARN_ON(!timerisset(tv)); *_tv = tv; } static void rlb_token_refill(td_rlb_t *rlb, td_rlb_token_t *token) { struct timeval tv; long long cred, max_usec; /* max time needed to refill up to cap */ max_usec = token->cap - token->cred; max_usec *= 1000000; max_usec += token->rate - 1; max_usec /= token->rate; /* actual credit gained */ timersub(&rlb->now, &rlb->ts, &tv); cred = rlb_tv_usec(&tv); cred = MIN(cred, max_usec); cred *= token->rate; cred /= 1000000; /* up to cap */ token->cred += cred; token->cred = MIN(token->cred, token->cap); } static void rlb_token_dispatch(td_rlb_t *rlb, void *data) { td_rlb_token_t *token = data; td_rlb_conn_t *conn, *next; rlb_token_refill(rlb, token); rlb_for_each_waiting_safe(conn, next, rlb) { if (token->cred < 0) break; token->cred -= conn->need; rlb_conn_respond(rlb, conn, conn->need); } } static void rlb_token_reset(td_rlb_t *rlb, void *data) { td_rlb_token_t *token = data; token->cred = token->cap; } static void rlb_token_destroy(td_rlb_t *rlb, void *data) { td_rlb_token_t *token = data; if (token) free(token); } static int rlb_token_create(td_rlb_t *rlb, int argc, char **argv, void **data) { td_rlb_token_t *token; int err; token = calloc(1, sizeof(*token)); if (!token) { err = -ENOMEM; goto fail; } token->rate = 0; token->cap = 0; do { const struct option longopts[] = { { "rate", 1, NULL, 'r' }, { "cap", 1, NULL, 'c' }, { NULL, 0, NULL, 0 } }; int c; c = getopt_long(argc, argv, "r:c:", longopts, NULL); if (c < 0) break; switch (c) { case 'r': token->rate = rlb_strtol(optarg); if (token->rate < 0) { ERR("invalid --rate"); goto usage; } break; case 'c': token->cap = rlb_strtol(optarg); if (token->cap < 0) { ERR("invalid --cap"); goto usage; } break; case '?': goto usage; default: BUG(); } } while (1); if (!token->rate) { ERR("--rate required"); goto usage; } rlb_token_reset(rlb, token); *data = token; return 0; fail: if (token) free(token); return err; usage: err = -EINVAL; goto fail; } 
static void rlb_token_usage(td_rlb_t *rlb, FILE *stream, void *data) { fprintf(stream, " {-t|--type}=token --" " {-r|--rate}=" " {-c|--cap}="); } static void rlb_token_info(td_rlb_t *rlb, void *data) { td_rlb_token_t *token = data; INFO("TOKEN: rate: %ld B/s cap: %ld B cred: %ld B", token->rate, token->cap, token->cred); } static struct ratelimit_ops rlb_token_ops = { .usage = rlb_token_usage, .create = rlb_token_create, .destroy = rlb_token_destroy, .info = rlb_token_info, .settimeo = rlb_token_settimeo, .timeout = rlb_token_dispatch, .dispatch = rlb_token_dispatch, .reset = rlb_token_reset, }; /* * meminfo valve */ typedef struct ratelimit_meminfo td_rlb_meminfo_t; struct ratelimit_meminfo { unsigned int period; struct timeval ts; FILE *s; unsigned long total; unsigned long dirty; unsigned long writeback; unsigned int limit_hi; unsigned int limit_lo; unsigned int congested; struct rlb_valve valve; struct timeval timeo; }; static void rlb_meminfo_info(td_rlb_t *rlb, void *data) { td_rlb_meminfo_t *m = data; INFO("MEMINFO: lo/hi: %u/%u%% period: %u ms", m->limit_lo, m->limit_hi, m->period); INFO("MEMINFO: total %lu kB, dirty/writeback %lu/%lu kB", m->total, m->dirty, m->writeback); m->valve.ops->info(rlb, m->valve.data); } static void rlb_meminfo_close(td_rlb_meminfo_t *m) { if (m->s) { fclose(m->s); m->s = NULL; } } static int rlb_meminfo_open(td_rlb_meminfo_t *m) { FILE *s; int err; m->s = NULL; s = fopen("/proc/meminfo", "r"); if (!s) { err = -errno; goto fail; } m->s = s; return 0; fail: rlb_meminfo_close(m); return err; } static inline int __test_bit(int n, unsigned long *bitmap) { return !!(*bitmap & (1UL<s); if (!b) break; for (i = 0; i < n_keys; i++) { struct ratelimit_meminfo_scan *scan; unsigned long val, *ptr; int n; if (!__test_bit(i, &pending)) continue; scan = &rlb_meminfo_scanfs[i]; n = sscanf(buf, scan->format, &val); if (n != 1) continue; ptr = (void*)m + scan->ptrdiff; *ptr = val; __clear_bit(i, &pending); } } while (pending); if (pending) { err = 
-ESRCH; goto fail; } err = 0; fail: rlb_meminfo_close(m); return err; } static void rlb_meminfo_usage(td_rlb_t *rlb, FILE *stream, void *data) { td_rlb_meminfo_t *m = data; fprintf(stream, " {-t|--type}=meminfo " " {-H|--high}= {-L|--low}=" " {-p|--period}= --"); if (m && m->valve.ops) { m->valve.ops->usage(rlb, stream, m->valve.data); } else fprintf(stream, " {-t|--type}={...}"); } static void rlb_meminfo_destroy(td_rlb_t *rlb, void *data) { td_rlb_meminfo_t *m = data; if (m) { if (m->valve.data) { m->valve.ops->destroy(rlb, m->valve.data); m->valve.data = NULL; } free(m); } } static int rlb_meminfo_create(td_rlb_t *rlb, int argc, char **argv, void **data) { td_rlb_meminfo_t *m; const char *type; long dbr; int err; m = calloc(1, sizeof(*m)); if (!m) { PERROR("calloc"); err = -errno; goto fail; } type = NULL; m->period = 100; do { const struct option longopts[] = { { "period", 1, NULL, 'p' }, { "type", 1, NULL, 't' }, { "high", 1, NULL, 'H' }, { "low", 1, NULL, 'L' }, { NULL, 0, NULL, 0 } }; int c; c = getopt_long(argc, argv, "p:t:H:L:", longopts, NULL); if (c < 0) break; switch (c) { case 'p': m->period = rlb_strtol(optarg); if (m->period < 0) goto usage; break; case 'H': m->limit_hi = strtoul(optarg, NULL, 0); break; case 'L': m->limit_lo = strtoul(optarg, NULL, 0); break; case 't': type = optarg; break; case '?': goto usage; default: BUG(); } } while (1); if (!m->limit_hi || !m->limit_lo) { ERR("--high/--low required"); goto usage; } if (m->limit_lo >= m->limit_hi) { ERR("invalid --high/--low ratio"); goto usage; } if (!type) { ERR("(sub) --type required"); goto usage; } dbr = sysctl_strtoul("vm/dirty_background_ratio"); if (dbr < 0) { err = dbr; ERR("vm/dirty_background_ratio: %d", err); goto fail; } if (0 && m->limit_lo < dbr) { ERR("--low %u is less than vm.dirty_background_ratio (= %ld)", m->limit_lo, dbr); err = -EINVAL; goto fail; } *data = m; rlb_argv_shift(&optind, &argc, &argv); err = rlb_create_valve(rlb, &m->valve, type, argc, argv); if (err) { if 
(err == -EINVAL) goto usage; goto fail; } err = rlb_meminfo_scan(m); if (err) { PERROR("/proc/meminfo"); goto fail; } return 0; fail: ERR("err = %d", err); return err; usage: err = -EINVAL; return err; }; static void rlb_meminfo_settimeo(td_rlb_t *rlb, struct timeval **_tv, void *data) { td_rlb_meminfo_t *m = data; int idle; idle = list_empty(&rlb->wait); BUG_ON(!idle && !m->congested); if (m->congested) { m->valve.ops->settimeo(rlb, _tv, m->valve.data); return; } *_tv = NULL; } static void rlb_meminfo_timeout(td_rlb_t *rlb, void *data) { td_rlb_meminfo_t *m = data; WARN_ON(!m->congested); if (m->congested) m->valve.ops->timeout(rlb, m->valve.data); } static int rlb_meminfo_test_high(td_rlb_t *rlb, td_rlb_meminfo_t *m, long long cred) { long long lo; if (m->congested) { /* hysteresis */ lo = m->total; lo *= m->limit_lo; lo /= 100; if (cred >= lo) return 0; } else if (cred <= 0) { m->valve.ops->reset(rlb, m->valve.data); return 1; } return m->congested; } static void rlb_meminfo_dispatch_low(td_rlb_t *rlb, td_rlb_meminfo_t *m, long long *_cred) { td_rlb_conn_t *conn, *next; long long cred = *_cred, grant; rlb_for_each_waiting_safe(conn, next, rlb) { if (cred <= 0) break; grant = MIN(cred, conn->need); rlb_conn_respond(rlb, conn, grant); cred -= grant; } *_cred = cred; } static void rlb_meminfo_dispatch(td_rlb_t *rlb, void *data) { td_rlb_meminfo_t *m = data; long long us, hi, cred, dirty, pend; /* we run only once per m->period */ us = rlb_usec_since(rlb, &m->ts); if (us / 1000 > m->period) { rlb_meminfo_scan(m); m->ts = rlb->now; } /* uncongested credit: memory below hi watermark minus pending I/O */ hi = m->total; hi *= m->limit_hi; hi /= 100; dirty = m->dirty + m->writeback; cred = hi - dirty; cred *= 1000; pend = rlb_pending(rlb); cred -= pend; m->congested = rlb_meminfo_test_high(rlb, m, cred); DBG(3, "dirty=%lld (%lld) pend=%llu cred=%lld %s", dirty, dirty * 100 / m->total, pend, cred, m->congested ? 
"congested" : ""); if (!m->congested) { rlb_meminfo_dispatch_low(rlb, m, &cred); m->congested = rlb_meminfo_test_high(rlb, m, cred); } if (m->congested) m->valve.ops->dispatch(rlb, m->valve.data); } static struct ratelimit_ops rlb_meminfo_ops = { .usage = rlb_meminfo_usage, .create = rlb_meminfo_create, .destroy = rlb_meminfo_destroy, .info = rlb_meminfo_info, .settimeo = rlb_meminfo_settimeo, .timeout = rlb_meminfo_timeout, .dispatch = rlb_meminfo_dispatch, }; /* * main loop */ static void rlb_info(td_rlb_t *rlb) { rlb->valve.ops->info(rlb, rlb->valve.data); rlb_conn_infos(rlb); } static sigset_t rlb_sigunblock; static sigset_t rlb_sigpending; static void rlb_sigmark(int signo) { INFO("Caught SIG%d", signo); sigaddset(&rlb_sigpending, signo); } static int rlb_siginit(void) { struct sigaction sa_ignore = { .sa_handler = SIG_IGN }; struct sigaction sa_pending = { .sa_handler = rlb_sigmark }; sigset_t sigmask; int err = 0; if (!err) err = sigaction(SIGPIPE, &sa_ignore, NULL); if (!err) err = sigaction(SIGINT, &sa_pending, NULL); if (!err) err = sigaction(SIGTERM, &sa_pending, NULL); if (!err) err = sigaction(SIGUSR1, &sa_pending, NULL); if (err) { err = -errno; goto fail; } sigemptyset(&sigmask); sigaddset(&sigmask, SIGINT); sigaddset(&sigmask, SIGTERM); sigaddset(&sigmask, SIGUSR1); err = sigprocmask(SIG_BLOCK, &sigmask, &rlb_sigunblock); if (err) { err = -errno; goto fail; } fail: return err; } static int rlb_main_signaled(td_rlb_t *rlb) { if (sigismember(&rlb_sigpending, SIGUSR1)) rlb_info(rlb); if (sigismember(&rlb_sigpending, SIGINT) || sigismember(&rlb_sigpending, SIGTERM)) return -EINTR; return 0; } static struct ratelimit_ops * rlb_find_valve(const char *name) { struct ratelimit_ops *ops = NULL; switch (name[0]) { #if 0 case 'l': if (!strcmp(name, "leaky")) ops = &rlb_leaky_ops; break; #endif case 't': if (!strcmp(name, "token")) ops = &rlb_token_ops; break; case 'm': if (!strcmp(name, "meminfo")) ops = &rlb_meminfo_ops; break; } return ops; } static int 
rlb_main_iterate(td_rlb_t *rlb) { td_rlb_conn_t *conn, *next; struct timeval *tv; struct timespec _ts, *ts = &_ts; int nfds, err; fd_set rfds; FD_ZERO(&rfds); nfds = 0; if (stdin) { FD_SET(STDIN_FILENO, &rfds); nfds = MAX(nfds, STDIN_FILENO); } if (rlb->sock >= 0) { FD_SET(rlb->sock, &rfds); nfds = MAX(nfds, rlb->sock); } rlb_for_each_conn(conn, rlb) { FD_SET(conn->sock, &rfds); nfds = MAX(nfds, conn->sock); } rlb->valve.ops->settimeo(rlb, &tv, rlb->valve.data); if (tv) { TIMEVAL_TO_TIMESPEC(tv, ts); } else ts = NULL; rlb->ts = rlb->now; nfds = pselect(nfds + 1, &rfds, NULL, NULL, ts, &rlb_sigunblock); if (nfds < 0) { err = -errno; if (err != -EINTR) PERROR("select"); goto fail; } gettimeofday(&rlb->now, NULL); if (!nfds) { BUG_ON(!ts); rlb->valve.ops->timeout(rlb, rlb->valve.data); } if (nfds) { rlb_for_each_conn_safe(conn, next, rlb) if (FD_ISSET(conn->sock, &rfds)) { rlb_conn_receive(rlb, conn); if (!--nfds) break; } rlb->valve.ops->dispatch(rlb, rlb->valve.data); } if (unlikely(nfds)) { if (FD_ISSET(STDIN_FILENO, &rfds)) { getc(stdin); rlb_info(rlb); nfds--; } } if (unlikely(nfds)) { if (FD_ISSET(rlb->sock, &rfds)) { rlb_accept_conn(rlb); nfds--; } } BUG_ON(nfds); err = 0; fail: return err; } static int rlb_main_run(td_rlb_t *rlb) { int err; do { err = rlb_main_iterate(rlb); if (err) { if (err != -EINTR) break; err = rlb_main_signaled(rlb); if (err) { err = 0; break; } } } while (rlb->sock >= 0 || !list_empty(&rlb->open)); return err; } static void rlb_shutdown(td_rlb_t *rlb) { td_rlb_conn_t *conn, *next; rlb_for_each_conn_safe(conn, next, rlb) rlb_conn_close(rlb, conn); rlb_sock_close(rlb); } static void rlb_usage(td_rlb_t *rlb, const char *prog, FILE *stream) { fprintf(stream, "Usage: %s ", prog); if (rlb && rlb->valve.ops) rlb->valve.ops->usage(rlb, stream, rlb->valve.data); else fprintf(stream, " {-t|--type}={token|meminfo}" " [-h|--help] [-D|--debug=]"); fprintf(stream, "\n"); } static void rlb_destroy(td_rlb_t *rlb) { rlb_shutdown(rlb); if 
(rlb->valve.data) { rlb->valve.ops->destroy(rlb, rlb->valve.data); rlb->valve.data = NULL; } if (rlb->name) { free(rlb->name); rlb->name = NULL; } } static int rlb_create(td_rlb_t *rlb, const char *name) { int i, err; memset(rlb, 0, sizeof(*rlb)); INIT_LIST_HEAD(&rlb->open); INIT_LIST_HEAD(&rlb->wait); rlb->sock = -1; for (i = RLB_CONN_MAX - 1; i >= 0; i--) rlb_conn_free(rlb, &rlb->connv[i]); rlb->name = strdup(name); if (!rlb->name) { err = -errno; goto fail; } err = rlb_sock_open(rlb); if (err) goto fail; gettimeofday(&rlb->now, NULL); return 0; fail: WARN("err = %d", err); rlb_destroy(rlb); return err; } static int rlb_create_valve(td_rlb_t *rlb, struct rlb_valve *v, const char *name, int argc, char **argv) { struct ratelimit_ops *ops; int err; ops = rlb_find_valve(name); if (!ops) { ERR("No such driver: %s", name); err = -ESRCH; goto fail; } v->ops = ops; err = v->ops->create(rlb, argc, argv, &v->data); fail: return err; } static void rlb_openlog(const char *name, int facility) { static char ident[32]; snprintf(ident, sizeof(ident), "%s[%d]", name, getpid()); ident[sizeof(ident)-1] = 0; openlog(ident, 0, facility); rlb_vlog = vsyslog; } int main(int argc, char **argv) { td_rlb_t _rlb, *rlb; const char *prog, *type; int err; setbuf(stdin, NULL); setlinebuf(stderr); rlb = NULL; prog = basename(argv[0]); type = NULL; rlb_vlog = rlb_vlog_vfprintf; do { const struct option longopts[] = { { "help", 0, NULL, 'h' }, { "type", 1, NULL, 't' }, { "debug", 0, NULL, 'D' }, { NULL, 0, NULL, 0 }, }; int c; c = getopt_long(argc, argv, "ht:D:", longopts, NULL); if (c < 0) break; switch (c) { case 'h': rlb_usage(NULL, prog, stdout); return 0; case 't': type = optarg; break; case 'D': debug = strtoul(optarg, NULL, 0); break; case '?': goto usage; default: BUG(); } } while (1); if (!type) goto usage; if (argc - optind < 1) goto usage; err = rlb_siginit(); if (err) goto fail; err = rlb_create(&_rlb, argv[optind++]); if (err) goto fail; rlb = &_rlb; rlb_argv_shift(&optind, &argc, 
&argv); err = rlb_create_valve(rlb, &rlb->valve, type, argc, argv); if (err) { if (err == -EINVAL) goto usage; goto fail; } if (!debug) { err = daemon(0, 0); if (err) goto fail; stdin = stdout = stderr = NULL; rlb_openlog(prog, LOG_DAEMON); } INFO("TD ratelimit bridge: %s, pid %d", rlb->path, getpid()); rlb_info(rlb); err = rlb_main_run(rlb); if (err) INFO("Exiting with status %d", -err); fail: if (rlb) rlb_destroy(rlb); return -err; usage: rlb_usage(rlb, prog, stderr); err = -EINVAL; goto fail; } blktap-2.0.90/drivers/td.c0000644000000000000000000003455111664745551014034 0ustar rootroot/* * Copyright (c) 2007, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include #include #include "libvhd.h" #include "vhd-util.h" #include "tapdisk-utils.h" #if 1 #define DFPRINTF(_f, _a...) fprintf ( stdout, _f , ## _a ) #else #define DFPRINTF(_f, _a...) ((void)0) #endif typedef enum { TD_FIELD_HIDDEN = 0, TD_FIELD_INVALID = 1 } td_field_t; struct vdi_field { char *name; td_field_t id; }; static struct vdi_field td_vdi_fields[TD_FIELD_INVALID] = { { .id = TD_FIELD_HIDDEN, .name = "hidden" } }; typedef enum { TD_CMD_CREATE = 0, TD_CMD_SNAPSHOT, /* TD_CMD_COALESCE, */ TD_CMD_QUERY, /* TD_CMD_RESIZE, */ TD_CMD_SET, /* TD_CMD_REPAIR, */ /* TD_CMD_FILL, */ /* TD_CMD_READ, */ TD_CMD_INVALID, } td_command_t; struct command { td_command_t id; char *name; int needs_type; }; struct command commands[TD_CMD_INVALID] = { { .id = TD_CMD_CREATE, .name = "create", .needs_type = 1 }, { .id = TD_CMD_SNAPSHOT, .name = "snapshot", .needs_type = 1 }, /* { .id = TD_CMD_COALESCE, .name = "coalesce", .needs_type = 1 }, */ { .id = TD_CMD_QUERY, .name = "query", .needs_type = 1 }, /* { .id = TD_CMD_RESIZE, .name = "resize", .needs_type = 1 }, */ { .id = TD_CMD_SET, .name = "set", .needs_type = 1 }, /* { .id = TD_CMD_REPAIR, .name = "repair", .needs_type = 1 }, */ /* { .id = TD_CMD_FILL, .name = "fill", .needs_type = 1 }, */ /* { .id = TD_CMD_READ, .name = "read", .needs_type = 1 }, */ }; typedef enum { TD_TYPE_VHD = 0, 
TD_TYPE_AIO, TD_TYPE_INVALID, } td_disk_t; const char *td_disk_types[TD_TYPE_INVALID] = { "vhd", "aio", }; #define print_commands() \ do { \ int i; \ fprintf(stderr, "COMMAND := { "); \ fprintf(stderr, "%s", commands[0].name); \ for (i = 1; i < TD_CMD_INVALID; i++) \ fprintf(stderr, " | %s", commands[i].name); \ fprintf(stderr, " }\n"); \ } while (0) #define print_disk_types() \ do { \ int i; \ fprintf(stderr, "TYPE := { "); \ fprintf(stderr, "%s", td_disk_types[0]); \ for (i = 1; i < TD_TYPE_INVALID; i++) \ fprintf(stderr, " | %s", td_disk_types[i]); \ fprintf(stderr, " }\n"); \ } while (0); #define print_field_names() \ do { \ int i; \ fprintf(stderr, "FIELD := { "); \ fprintf(stderr, "%s", td_vdi_fields[0].name); \ for (i = 1; i < TD_FIELD_INVALID; i++) \ fprintf(stderr, " | %s", td_vdi_fields[i].name); \ fprintf(stderr, " }\n"); \ } while (0) void help(void) { fprintf(stderr, "Tapdisk Utilities: v1.0.0\n"); fprintf(stderr, "usage: td-util COMMAND [TYPE] [OPTIONS]\n"); print_commands(); print_disk_types(); exit(-1); } struct command * get_command(char *command) { int i; for (i = 0; i < TD_CMD_INVALID; i++) if (!strcmp(command, commands[i].name)) return &commands[i]; return NULL; } struct vdi_field * get_field(char *field) { int i; for (i = 0; i < TD_FIELD_INVALID; i++) if (!strcmp(field, td_vdi_fields[i].name)) return &td_vdi_fields[i]; return NULL; } int get_driver_type(char *type) { int i; if (strnlen(type, 25) >= 25) return -ENAMETOOLONG; for (i = 0; i < TD_TYPE_INVALID; i++) if (!strcmp(type, td_disk_types[i])) return i; return -TD_TYPE_INVALID; } int td_create(int type, int argc, char *argv[]) { ssize_t mb; uint64_t size; char *name, *buf; int c, i, fd, sparse = 1, fixedsize = 0; while ((c = getopt(argc, argv, "hrb")) != -1) { switch(c) { case 'r': sparse = 0; break; case 'b': fixedsize = 1; break; default: fprintf(stderr, "Unknown option %c\n", (char)c); case 'h': goto usage; } } if (optind != (argc - 2)) goto usage; mb = 1 << 20; size = 
atoi(argv[optind++]); size = size << 20; name = argv[optind]; if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) { fprintf(stderr, "Device name too long\n"); return ENAMETOOLONG; } if (type == TD_TYPE_VHD) { int cargc = 0; char sbuf[32], *cargv[10]; size >>= 20; memset(cargv, 0, sizeof(cargv)); snprintf(sbuf, sizeof(sbuf) - 1, "%"PRIu64, size); cargv[cargc++] = "create"; cargv[cargc++] = "-n"; cargv[cargc++] = name; cargv[cargc++] = "-s"; cargv[cargc++] = sbuf; if (!sparse) cargv[cargc++] = "-r"; if (fixedsize) cargv[cargc++] = "-b"; return vhd_util_create(cargc, cargv); } /* generic create */ if (sparse) { fprintf(stderr, "Cannot create sparse %s image\n", td_disk_types[type]); return EINVAL; } buf = calloc(1, mb); if (!buf) return ENOMEM; fd = open(name, O_WRONLY | O_DIRECT | O_CREAT | O_TRUNC, 0644); if (fd == -1) { free(buf); return errno; } size >>= 20; for (i = 0; i < size; i++) if (write(fd, buf, mb) != mb) { close(fd); unlink(name); free(buf); return EIO; } close(fd); free(buf); return 0; usage: fprintf(stderr, "usage: td-util create %s [-h help] [-r reserve] " "[-b file_is_fixed_size] \n", td_disk_types[type]); return EINVAL; } int td_snapshot(int type, int argc, char *argv[]) { char *cargv[10]; int c, err, cargc; struct stat stats; char *name, *backing, *limit = NULL; int fixedsize = 0, rawparent = 0; if (type != TD_TYPE_VHD) { fprintf(stderr, "Cannot create snapshot of %s image type\n", td_disk_types[type]); return EINVAL; } while ((c = getopt(argc, argv, "hbml:")) != -1) { switch(c) { case 'b': fixedsize = 1; break; case 'm': rawparent = 1; break; case 'l': limit = optarg; break; case 'h': err = 0; goto usage; default: err = EINVAL; goto usage; } } if (optind != (argc - 2)) { err = EINVAL; goto usage; } name = argv[optind++]; backing = argv[optind++]; if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN || strnlen(backing, MAX_NAME_LEN) == MAX_NAME_LEN) { fprintf(stderr, "Device name too long\n"); return ENAMETOOLONG; } if (stat(backing, &stats) == -1) { 
fprintf(stderr, "File %s not found\n", backing); return errno; } cargc = 0; memset(cargv, 0, sizeof(cargv)); cargv[cargc++] = "snapshot"; cargv[cargc++] = "-n"; cargv[cargc++] = name; cargv[cargc++] = "-p"; cargv[cargc++] = backing; if (fixedsize) cargv[cargc++] = "-b"; if (rawparent) cargv[cargc++] = "-m"; if (limit) { cargv[cargc++] = "-l"; cargv[cargc++] = limit; } return vhd_util_snapshot(cargc, cargv); usage: fprintf(stderr, "usage: td-util snapshot %s [-h help] [-m parent_raw] " "[-b file_is_fixed_size] [-l snapshot depth limit] " " \n", td_disk_types[type]); return err; } int td_coalesce(int type, int argc, char *argv[]) { int c, ret, cargc; char *name, *cargv[3]; if (type != TD_TYPE_VHD) { fprintf(stderr, "Cannot create snapshot of %s image type\n", td_disk_types[type]); return EINVAL; } while ((c = getopt(argc, argv, "h")) != -1) { switch(c) { default: fprintf(stderr, "Unknown option %c\n", (char)c); case 'h': goto usage; } } if (optind != (argc - 1)) goto usage; name = argv[optind++]; if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) { fprintf(stderr, "Device name too long\n"); return ENAMETOOLONG; } cargc = 0; memset(cargv, 0, sizeof(cargv)); cargv[cargc++] = "coalesce"; cargv[cargc++] = "-n"; cargv[cargc++] = name; ret = vhd_util_coalesce(cargc, cargv); if (ret) printf("coalesce failed: %d\n", ret); return ret; usage: fprintf(stderr, "usage: td-util coalesce %s [-h help] " "\n", td_disk_types[type]); return EINVAL; } int td_query(int type, int argc, char *argv[]) { char *name; int c, size = 0, parent = 0, fields = 0, depth = 0, err = 0; while ((c = getopt(argc, argv, "hvpfd")) != -1) { switch(c) { case 'v': size = 1; break; case 'p': parent = 1; break; case 'f': fields = 1; break; case 'd': depth = 1; break; case 'h': err = 0; goto usage; default: err = EINVAL; goto usage; } } if (optind != (argc - 1)) { err = EINVAL; goto usage; } name = argv[optind++]; if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) { fprintf(stderr, "Device name too long\n"); return 
ENAMETOOLONG; } if (type == TD_TYPE_VHD) { vhd_context_t vhd; err = vhd_open(&vhd, name, VHD_OPEN_RDONLY); if (err) { printf("failed opening %s: %d\n", name, err); return err; } if (size) printf("%"PRIu64"\n", vhd.footer.curr_size >> 20); if (parent) { if (vhd.footer.type != HD_TYPE_DIFF) printf("%s has no parent\n", name); else { char *pname; err = vhd_parent_locator_get(&vhd, &pname); if (err) printf("failed getting parent: %d\n", err); else { printf("%s\n", pname); free(pname); } } } if (fields) { int ret, hidden; ret = vhd_hidden(&vhd, &hidden); if (ret) { printf("failed checking 'hidden' field: %d\n", ret); err = (err ? : ret); } else printf("%s: %d\n", td_vdi_fields[TD_FIELD_HIDDEN].name, hidden); } if (depth) { int ret, length; ret = vhd_chain_depth(&vhd, &length); if (ret) printf("error checking chain depth: %d\n", ret); else printf("chain depth: %d\n", length); err = (err ? : ret); } vhd_close(&vhd); } else if (type == TD_TYPE_AIO) { if (size) { int fd; uint64_t secs; uint32_t ssize; fd = open(name, O_RDONLY | O_LARGEFILE); if (fd == -1) { printf("failed opening %s: %d\n", name, errno); return -errno; } err = tapdisk_get_image_size(fd, &secs, &ssize); close(fd); if (err) { printf("failed getting size for %s: %d\n:", name, err); return err; } printf("%"PRIu64"\n", secs >> 11); } if (parent) printf("%s has no parent\n", name); if (fields) { int i; for (i = 0; i < TD_FIELD_INVALID; i++) printf("%s: 0\n", td_vdi_fields[i].name); } } return err; usage: fprintf(stderr, "usage: td-util query %s [-h help] [-v virtsize] " "[-p parent] [-f fields] \n", td_disk_types[type]); return err; } int td_set_field(int type, int argc, char *argv[]) { int c, cargc; struct vdi_field *field; char *name, *value, *cargv[7]; if (type != TD_TYPE_VHD) { fprintf(stderr, "Cannot set fields of %s images\n", td_disk_types[type]); return EINVAL; } while ((c = getopt(argc, argv, "h")) != -1) { switch(c) { default: fprintf(stderr, "Unknown option %c\n", (char)c); case 'h': goto usage; } } if 
(optind != (argc - 3))
		goto usage;

	name  = argv[optind++];

	/* "hidden" is the only field this command accepts. */
	field = get_field(argv[optind]);
	if (!field || field->id != TD_FIELD_HIDDEN) {
		fprintf(stderr, "Invalid field %s\n", argv[optind]);
		goto usage;
	}

	value = argv[++optind];

	/* Build the argument vector for vhd_util_set_field(). */
	cargc = 0;
	memset(cargv, 0, sizeof(cargv));
	cargv[cargc++] = "set";
	cargv[cargc++] = "-n";
	cargv[cargc++] = name;
	cargv[cargc++] = "-f";
	cargv[cargc++] = field->name;
	cargv[cargc++] = "-v";
	cargv[cargc++] = value;
	return vhd_util_set_field(cargc, cargv);

usage:
	fprintf(stderr, "usage: td-util set %s [-h help] "
		" \n", td_disk_types[type]);
	print_field_names();
	return EINVAL;
}

/*
 * td-util entry point: argv[1] selects the command and, for commands that
 * need one, argv[2] selects the disk type. The remaining arguments are
 * copied into a fresh vector whose element 0 is the command name, and that
 * vector is handed to the command handler.
 */
int
main(int argc, char *argv[])
{
	char **cargv;
	struct command *cmd;
	int cargc, i, type = -1, ret = 0;

#ifdef CORE_DUMP
	struct rlimit rlim;
	rlim.rlim_cur = RLIM_INFINITY;
	rlim.rlim_max = RLIM_INFINITY;
	if (setrlimit(RLIMIT_CORE, &rlim) < 0)
		fprintf(stderr, "setrlimit failed: %d\n", errno);
#endif

	/* help() does not return: it ends in exit(). */
	if (argc < 2)
		help();

	cargc = argc - 1;
	cmd   = get_command(argv[1]);
	if (!cmd) {
		fprintf(stderr, "invalid COMMAND %s\n", argv[1]);
		help();
	}

	if (cmd->needs_type) {
		if (argc < 3) {
			fprintf(stderr, "td-util %s requires a TYPE\n",
				cmd->name);
			print_disk_types();
			exit(-1);
		}

		type = get_driver_type(argv[2]);
		if (type < 0) {
			fprintf(stderr, "invalid TYPE '%s'.\n", argv[2]);
			print_disk_types();
			exit(-1);
		}
		/* the TYPE word is consumed here and not forwarded */
		--cargc;
	}

	cargv = malloc(sizeof(char *) * cargc);
	if (!cargv)
		exit(ENOMEM);

	/* argc - cargc is the number of leading words that were dropped. */
	cargv[0] = cmd->name;
	for (i = 1; i < cargc; i++)
		cargv[i] = argv[i + (argc - cargc)];

	switch(cmd->id) {
	case TD_CMD_CREATE:
		ret = td_create(type, cargc, cargv);
		break;
	case TD_CMD_SNAPSHOT:
		ret = td_snapshot(type, cargc, cargv);
		break;
/*
	case TD_CMD_COALESCE:
		ret = td_coalesce(type, cargc, cargv);
		break;
*/
	case TD_CMD_QUERY:
		ret = td_query(type, cargc, cargv);
		break;
/*
	case TD_CMD_RESIZE:
		ret = td_resize(type, cargc, cargv);
		break;
*/
	case TD_CMD_SET:
		ret = td_set_field(type, cargc, cargv);
		break;
/*
	case TD_CMD_REPAIR:
		ret = td_repair(type, cargc, cargv);
		break;
	case TD_CMD_FILL:
		ret = td_fill(type,
cargc, cargv); break; case TD_CMD_READ: ret = td_read(type, cargc, cargv); break; */ default: case TD_CMD_INVALID: ret = EINVAL; break; } free(cargv); return (ret >= 0 ? ret : -ret); } blktap-2.0.90/drivers/atomicio.h0000644000000000000000000000311611664745551015227 0ustar rootroot/* $OpenBSD: atomicio.h,v 1.6 2005/05/24 17:32:43 avsm Exp $ */ /* * Copyright (c) 1995,1999 Theo de Raadt. All rights reserved. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Ensure all of data on socket comes through. f==read || f==vwrite */ size_t atomicio(ssize_t (*)(int, void *, size_t), int, void *, size_t); #define vwrite (ssize_t (*)(int, void *, size_t))write blktap-2.0.90/drivers/block-valve.h0000644000000000000000000000354211664745551015633 0ustar rootroot/* * Copyright (c) 2011, Citrix Inc. 
* All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _TAPDISK_VALVE_H_ #define _TAPDISK_VALVE_H_ #define TD_VALVE_SOCKDIR "/var/run/blktap/ratelimit" #define TD_RLB_CONN_MAX 1024 #define TD_RLB_REQUEST_MAX (8 << 20) struct td_valve_req { unsigned long need; unsigned long done; }; #endif /* _TAPDISK_VALVE_H_ */ blktap-2.0.90/drivers/tapdisk-interface.c0000644000000000000000000001432111664745551017013 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include
#include
#include

#include "tapdisk.h"
#include "tapdisk-vbd.h"
#include "tapdisk-image.h"
#include "tapdisk-driver.h"
#include "tapdisk-server.h"
#include "tapdisk-interface.h"
#include "tapdisk-log.h"

/*
 * Attach image to the driver of the shared image that
 * tapdisk_server_get_shared_image() returns for it, taking a reference.
 *
 * Returns 0, -ENODEV when there is no shared image, or -EBADF when the
 * shared image has no driver.
 */
int
td_load(td_image_t *image)
{
	td_image_t *shared;
	td_driver_t *driver;

	shared = tapdisk_server_get_shared_image(image);
	if (!shared)
		return -ENODEV;

	driver = shared->driver;
	if (!driver)
		return -EBADF;

	driver->refcnt++;
	image->driver = driver;
	image->info   = driver->info;

	DPRINTF("loaded shared image %s (%d users, state: 0x%08x, type: %d)\n",
		driver->name, driver->refcnt, driver->state, driver->type);
	return 0;
}

/*
 * Open image, allocating a driver when it does not have one yet, and take
 * a reference on the driver. A non-NULL info pre-seeds the info of a
 * freshly allocated driver.
 *
 * Returns 0, -ENOMEM when the driver cannot be allocated, or the error
 * from the driver's td_open callback.
 */
int
__td_open(td_image_t *image, td_disk_info_t *info)
{
	int err;
	td_driver_t *driver;

	driver = image->driver;
	if (!driver) {
		driver = tapdisk_driver_allocate(image->type,
						 image->name,
						 image->flags);
		if (!driver)
			return -ENOMEM;

		if (info) /* pre-seed driver->info for virtual drivers */
			driver->info = *info;
	}

	if (!td_flag_test(driver->state, TD_DRIVER_OPEN)) {
		err = driver->ops->td_open(driver, image->name, image->flags);
		if (err) {
			/* only free a driver that was allocated above */
			if (!image->driver)
				tapdisk_driver_free(driver);
			return err;
		}

		td_flag_set(driver->state, TD_DRIVER_OPEN);
		/* refcnt + 1: the reference is only taken further down */
		DPRINTF("opened image %s (%d users, state: 0x%08x, type: %d, %s)\n",
			driver->name, driver->refcnt + 1,
			driver->state, driver->type,
			td_flag_test(image->flags, TD_OPEN_RDONLY) ? "ro" : "rw");
	}

	image->driver = driver;
	image->info   = driver->info;
	driver->refcnt++;
	return 0;
}

/* Open image without pre-seeded disk info. */
int
td_open(td_image_t *image)
{
	return __td_open(image, NULL);
}

/*
 * Drop image's reference on its driver; the driver's td_close callback
 * runs when the last reference goes away while the driver is open.
 *
 * Returns 0, or -ENODEV when the image has no driver.
 */
int
td_close(td_image_t *image)
{
	td_driver_t *driver;

	driver = image->driver;
	if (!driver)
		return -ENODEV;

	driver->refcnt--;
	if (!driver->refcnt && td_flag_test(driver->state, TD_DRIVER_OPEN)) {
		driver->ops->td_close(driver);
		td_flag_clear(driver->state, TD_DRIVER_OPEN);
	}

	DPRINTF("closed image %s (%d users, state: 0x%08x, type: %d)\n",
		driver->name, driver->refcnt, driver->state, driver->type);

	return 0;
}

/*
 * Ask the image's driver for the id of its parent.
 *
 * Returns the driver callback's result, -ENODEV when the image has no
 * driver, or -EBADF when the driver is not open.
 */
int
td_get_parent_id(td_image_t *image, td_disk_id_t *id)
{
	td_driver_t *driver;

	driver = image->driver;
	if (!driver)
		return -ENODEV;

	if (!td_flag_test(driver->state, TD_DRIVER_OPEN))
		return -EBADF;

	return driver->ops->td_get_parent_id(driver, id);
}

/*
 * Check that image and parent both have an open driver.
 *
 * Returns 0, -ENODEV when either driver is missing, or -EBADF when either
 * driver is not open.
 */
int
td_validate_parent(td_image_t *image, td_image_t *parent)
{
	td_driver_t *driver, *pdriver;

	driver  = image->driver;
	pdriver = parent->driver;
	if (!driver || !pdriver)
		return -ENODEV;

	if (!td_flag_test(driver->state, TD_DRIVER_OPEN) ||
	    !td_flag_test(pdriver->state, TD_DRIVER_OPEN))
		return -EBADF;

	/*
	 * NOTE(review): this unconditional return makes the call to the
	 * driver's td_validate_parent callback below unreachable. It looks
	 * like the check was disabled on purpose; confirm before removing
	 * either statement.
	 */
	return 0;
	return driver->ops->td_validate_parent(driver, pdriver, 0);
}

/*
 * Hand a write request to the image's driver. The request is completed
 * with an error instead when the driver is missing (-ENODEV), not open
 * (-EBADF), has no write callback (-EOPNOTSUPP), or when
 * tapdisk_image_check_td_request() rejects the request.
 */
void
td_queue_write(td_image_t *image, td_request_t treq)
{
	int err;
	td_driver_t *driver;

	driver = image->driver;
	if (!driver) {
		err = -ENODEV;
		goto fail;
	}

	if (!td_flag_test(driver->state, TD_DRIVER_OPEN)) {
		err = -EBADF;
		goto fail;
	}

	if (!driver->ops->td_queue_write) {
		err = -EOPNOTSUPP;
		goto fail;
	}

	err = tapdisk_image_check_td_request(image, treq);
	if (err)
		goto fail;

	driver->ops->td_queue_write(driver, treq);
	return;

fail:
	td_complete_request(treq, err);
}

/*
 * Hand a read request to the image's driver. The request is completed
 * with an error instead when the driver is missing (-ENODEV), not open
 * (-EBADF), has no read callback (-EOPNOTSUPP), or when
 * tapdisk_image_check_td_request() rejects the request.
 */
void
td_queue_read(td_image_t *image, td_request_t treq)
{
	int err;
	td_driver_t *driver;

	driver = image->driver;
	if (!driver) {
		err = -ENODEV;
		goto fail;
	}

	if (!td_flag_test(driver->state, TD_DRIVER_OPEN)) {
		err = -EBADF;
		goto fail;
	}

	if (!driver->ops->td_queue_read) {
		err = -EOPNOTSUPP;
		goto fail;
	}

	err =
tapdisk_image_check_td_request(image, treq);
	if (err)
		goto fail;

	driver->ops->td_queue_read(driver, treq);
	return;

fail:
	td_complete_request(treq, err);
}

/* Pass the request on to tapdisk_vbd_forward_request(). */
void
td_forward_request(td_request_t treq)
{
	tapdisk_vbd_forward_request(treq);
}

/* Complete a request by invoking its callback with result res. */
void
td_complete_request(td_request_t treq, int res)
{
	treq.cb(treq, res);
}

/* Queue a prepared tiocb through tapdisk_driver_queue_tiocb(). */
void
td_queue_tiocb(td_driver_t *driver, struct tiocb *tiocb)
{
	tapdisk_driver_queue_tiocb(driver, tiocb);
}

/*
 * Prepare tiocb via tapdisk_prep_tiocb() with 0 as its third argument
 * (td_prep_write() passes 1 in the same position).
 */
void
td_prep_read(struct tiocb *tiocb, int fd, char *buf, size_t bytes,
	     long long offset, td_queue_callback_t cb, void *arg)
{
	tapdisk_prep_tiocb(tiocb, fd, 0, buf, bytes, offset, cb, arg);
}

/*
 * Prepare tiocb via tapdisk_prep_tiocb() with 1 as its third argument
 * (td_prep_read() passes 0 in the same position).
 */
void
td_prep_write(struct tiocb *tiocb, int fd, char *buf, size_t bytes,
	      long long offset, td_queue_callback_t cb, void *arg)
{
	tapdisk_prep_tiocb(tiocb, fd, 1, buf, bytes, offset, cb, arg);
}

/* Call tapdisk_driver_debug() for the image's driver, if it is open. */
void
td_debug(td_image_t *image)
{
	td_driver_t *driver;

	driver = image->driver;
	if (!driver || !td_flag_test(driver->state, TD_DRIVER_OPEN))
		return;

	tapdisk_driver_debug(driver);
}

/*
 * Fatal-error exit: call tlog_precious(), raise SIGABRT, and fall back to
 * _exit() should raise() return.
 */
__noreturn void
td_panic(void)
{
	tlog_precious();
	raise(SIGABRT);
	_exit(-1); /* not reached */
}
blktap-2.0.90/drivers/block-vhd.c0000644000000000000000000016074711664745551015305 0ustar rootroot/*
 *
 * Copyright (c) 2007, XenSource Inc.
 * Copyright (c) 2010, Citrix Systems, Inc.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * * Neither the name of XenSource Inc. nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * block-vhd.c: asynchronous vhd implementation. * * A note on write transactions: * Writes that require updating the BAT or bitmaps cannot be signaled * as complete until all updates have reached disk. Transactions are * used to ensure proper ordering in these cases. The two types of * transactions are as follows: * - Bitmap updates only: data writes that require updates to the same * bitmap are grouped in a transaction. Only after all data writes * in a transaction complete does the bitmap write commence. Only * after the bitmap write finishes are the data writes signalled as * complete. * - BAT and bitmap updates: data writes are grouped in transactions * as above, but a special extra write is included in the transaction, * which zeros out the newly allocated bitmap on disk. When the data * writes and the zero-bitmap write complete, the BAT and bitmap writes * are started in parallel. The transaction is completed only after both * the BAT and bitmap writes successfully return. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include /* For whatever reason, Linux packages this in */ /* e2fsprogs-devel. */ #include /* for memset. 
*/ #include #include #include "libvhd.h" #include "tapdisk.h" #include "tapdisk-driver.h" #include "tapdisk-interface.h" #include "tapdisk-disktype.h" #include "tapdisk-storage.h" unsigned int SPB; #define DEBUGGING 2 #define ASSERTING 1 #define MICROSOFT_COMPAT #define VHD_BATMAP_MAX_RETRIES 10 #define __TRACE(s) \ do { \ DBG(TLOG_DBG, "%s: QUEUED: %" PRIu64 ", COMPLETED: %" \ PRIu64", RETURNED: %" PRIu64 ", DATA_ALLOCATED: " \ "%u, BBLK: 0x%04x\n", \ s->vhd.file, s->queued, s->completed, s->returned, \ VHD_REQS_DATA - s->vreq_free_count, \ s->bat.pbw_blk); \ } while(0) #define __ASSERT(_p) \ if (!(_p)) { \ DPRINTF("%s:%d: FAILED ASSERTION: '%s'\n", \ __FILE__, __LINE__, #_p); \ DBG(TLOG_WARN, "%s:%d: FAILED ASSERTION: '%s'\n", \ __FILE__, __LINE__, #_p); \ td_panic(); \ } #if (DEBUGGING == 1) #define DBG(level, _f, _a...) DPRINTF(_f, ##_a) #define ERR(_s, err, _f, _a...) DPRINTF("ERROR: %d: " _f, err, ##_a) #define TRACE(s) ((void)0) #elif (DEBUGGING == 2) #define DBG(level, _f, _a...) tlog_write(level, _f, ##_a) #define ERR(_s, _err, _f, _a...) tlog_drv_error((_s)->driver, _err, _f, ##_a) #define TRACE(s) __TRACE(s) #else #define DBG(level, _f, _a...) ((void)0) #define ERR(_s, err, _f, _a...) 
((void)0) #define TRACE(s) ((void)0) #endif #if (ASSERTING == 1) #define ASSERT(_p) __ASSERT(_p) #else #define ASSERT(_p) ((void)0) #endif /******VHD DEFINES******/ #define VHD_CACHE_SIZE 32 #define VHD_REQS_DATA TAPDISK_DATA_REQUESTS #define VHD_REQS_META (VHD_CACHE_SIZE + 2) #define VHD_REQS_TOTAL (VHD_REQS_DATA + VHD_REQS_META) #define VHD_OP_BAT_WRITE 0 #define VHD_OP_DATA_READ 1 #define VHD_OP_DATA_WRITE 2 #define VHD_OP_BITMAP_READ 3 #define VHD_OP_BITMAP_WRITE 4 #define VHD_OP_ZERO_BM_WRITE 5 #define VHD_OP_REDUNDANT_BM_WRITE 6 #define VHD_BM_BAT_LOCKED 0 #define VHD_BM_BAT_CLEAR 1 #define VHD_BM_BIT_CLEAR 2 #define VHD_BM_BIT_SET 3 #define VHD_BM_NOT_CACHED 4 #define VHD_BM_READ_PENDING 5 #define VHD_FLAG_OPEN_RDONLY 1 #define VHD_FLAG_OPEN_NO_CACHE 2 #define VHD_FLAG_OPEN_QUIET 4 #define VHD_FLAG_OPEN_STRICT 8 #define VHD_FLAG_OPEN_QUERY 16 #define VHD_FLAG_OPEN_PREALLOCATE 32 #define VHD_FLAG_BAT_LOCKED 1 #define VHD_FLAG_BAT_WRITE_STARTED 2 #define VHD_FLAG_BM_UPDATE_BAT 1 #define VHD_FLAG_BM_WRITE_PENDING 2 #define VHD_FLAG_BM_READ_PENDING 4 #define VHD_FLAG_BM_LOCKED 8 #define VHD_FLAG_REQ_UPDATE_BAT 1 #define VHD_FLAG_REQ_UPDATE_BITMAP 2 #define VHD_FLAG_REQ_QUEUED 4 #define VHD_FLAG_REQ_FINISHED 8 #define VHD_FLAG_TX_LIVE 1 #define VHD_FLAG_TX_UPDATE_BAT 2 typedef uint8_t vhd_flag_t; struct vhd_state; struct vhd_request; struct vhd_req_list { struct vhd_request *head; struct vhd_request *tail; }; struct vhd_transaction { int error; int closed; int started; int finished; vhd_flag_t status; struct vhd_req_list requests; }; struct vhd_request { int error; uint8_t op; vhd_flag_t flags; td_request_t treq; struct tiocb tiocb; struct vhd_state *state; struct vhd_request *next; struct vhd_transaction *tx; }; struct vhd_bat_state { vhd_bat_t bat; vhd_batmap_t batmap; vhd_flag_t status; uint32_t pbw_blk; /* blk num of pending write */ uint64_t pbw_offset; /* file offset of same */ struct vhd_request req; /* for writing bat table */ struct vhd_request zero_req; 
/* for initializing bitmaps */ char *bat_buf; }; struct vhd_bitmap { uint32_t blk; uint64_t seqno; /* lru sequence number */ vhd_flag_t status; char *map; /* map should only be modified * in finish_bitmap_write */ char *shadow; /* in-memory bitmap changes are * made to shadow and copied to * map only after having been * flushed to disk */ struct vhd_transaction tx; /* transaction data structure * encapsulating data, bitmap, * and bat writes */ struct vhd_req_list queue; /* data writes waiting for next * transaction */ struct vhd_req_list waiting; /* pending requests that cannot * be serviced until this bitmap * is read from disk */ struct vhd_request req; }; struct vhd_state { vhd_flag_t flags; /* VHD stuff */ vhd_context_t vhd; uint32_t spp; /* sectors per page */ uint32_t spb; /* sectors per block */ uint64_t first_db; /* pointer to datablock 0 */ uint64_t next_db; /* pointer to the next * (unallocated) datablock */ struct vhd_bat_state bat; uint64_t bm_lru; /* lru sequence number */ uint32_t bm_secs; /* size of bitmap, in sectors */ struct vhd_bitmap *bitmap[VHD_CACHE_SIZE]; int bm_free_count; struct vhd_bitmap *bitmap_free[VHD_CACHE_SIZE]; struct vhd_bitmap bitmap_list[VHD_CACHE_SIZE]; int vreq_free_count; struct vhd_request *vreq_free[VHD_REQS_DATA]; struct vhd_request vreq_list[VHD_REQS_DATA]; /* for redundant bitmap writes */ int padbm_size; char *padbm_buf; long int debug_skipped_redundant_writes; long int debug_done_redundant_writes; td_driver_t *driver; uint64_t queued; uint64_t completed; uint64_t returned; uint64_t reads; uint64_t read_size; uint64_t writes; uint64_t write_size; }; #define test_vhd_flag(word, flag) ((word) & (flag)) #define set_vhd_flag(word, flag) ((word) |= (flag)) #define clear_vhd_flag(word, flag) ((word) &= ~(flag)) #define bat_entry(s, blk) ((s)->bat.bat.bat[(blk)]) static void vhd_complete(void *, struct tiocb *, int); static void finish_data_transaction(struct vhd_state *, struct vhd_bitmap *); static struct vhd_state 
*_vhd_master; static unsigned long _vhd_zsize; static char *_vhd_zeros; static int vhd_initialize(struct vhd_state *s) { if (_vhd_zeros) return 0; _vhd_zsize = 2 * getpagesize(); if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)) _vhd_zsize += VHD_BLOCK_SIZE; _vhd_zeros = mmap(0, _vhd_zsize, PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (_vhd_zeros == MAP_FAILED) { EPRINTF("vhd_initialize failed: %d\n", -errno); _vhd_zeros = NULL; _vhd_zsize = 0; return -errno; } _vhd_master = s; return 0; } static void vhd_free(struct vhd_state *s) { if (_vhd_master != s || !_vhd_zeros) return; munmap(_vhd_zeros, _vhd_zsize); _vhd_zsize = 0; _vhd_zeros = NULL; _vhd_master = NULL; } static char * _get_vhd_zeros(const char *func, unsigned long size) { if (!_vhd_zeros || _vhd_zsize < size) { EPRINTF("invalid zero request from %s: %lu, %lu, %p\n", func, size, _vhd_zsize, _vhd_zeros); ASSERT(0); } return _vhd_zeros; } #define vhd_zeros(size) _get_vhd_zeros(__func__, size) static inline void set_batmap(struct vhd_state *s, uint32_t blk) { if (s->bat.batmap.map) { vhd_batmap_set(&s->vhd, &s->bat.batmap, blk); DBG(TLOG_DBG, "block 0x%x completely full\n", blk); } } static inline int test_batmap(struct vhd_state *s, uint32_t blk) { if (!s->bat.batmap.map) return 0; return vhd_batmap_test(&s->vhd, &s->bat.batmap, blk); } static int vhd_kill_footer(struct vhd_state *s) { int err; off64_t end; void *zeros; if (s->vhd.footer.type == HD_TYPE_FIXED) return 0; err = posix_memalign(&zeros, 512, 512); if (err) return -err; err = 1; memset(zeros, 0xc7c7c7c7, 512); if ((end = lseek64(s->vhd.fd, 0, SEEK_END)) == -1) goto fail; if (lseek64(s->vhd.fd, (end - 512), SEEK_SET) == -1) goto fail; if (write(s->vhd.fd, zeros, 512) != 512) goto fail; err = 0; fail: free(zeros); if (err) return (errno ? 
-errno : -EIO); return 0; } static inline int find_next_free_block(struct vhd_state *s) { int err; off64_t eom; uint32_t i, entry; err = vhd_end_of_headers(&s->vhd, &eom); if (err) return err; s->next_db = secs_round_up(eom); s->first_db = s->next_db; if ((s->first_db + s->bm_secs) % s->spp) s->first_db += (s->spp - ((s->first_db + s->bm_secs) % s->spp)); for (i = 0; i < s->bat.bat.entries; i++) { entry = bat_entry(s, i); if (entry != DD_BLK_UNUSED && entry >= s->next_db) s->next_db = entry + s->spb + s->bm_secs; } return 0; } static void vhd_free_bat(struct vhd_state *s) { free(s->bat.bat.bat); free(s->bat.batmap.map); free(s->bat.bat_buf); memset(&s->bat, 0, sizeof(struct vhd_bat)); } static int vhd_initialize_bat(struct vhd_state *s) { int err, batmap_required, i; void *buf; memset(&s->bat, 0, sizeof(struct vhd_bat)); err = vhd_read_bat(&s->vhd, &s->bat.bat); if (err) { EPRINTF("%s: reading bat: %d\n", s->vhd.file, err); return err; } batmap_required = 1; if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_RDONLY)) { batmap_required = 0; } else { err = find_next_free_block(s); if (err) goto fail; } if (vhd_has_batmap(&s->vhd)) { for (i = 0; i < VHD_BATMAP_MAX_RETRIES; i++) { err = vhd_read_batmap(&s->vhd, &s->bat.batmap); if (err) { EPRINTF("%s: reading batmap: %d\n", s->vhd.file, err); if (batmap_required) goto fail; } else { break; } } if (err) EPRINTF("%s: ignoring non-critical batmap error\n", s->vhd.file); } err = posix_memalign(&buf, VHD_SECTOR_SIZE, VHD_SECTOR_SIZE); if (err) goto fail; s->bat.bat_buf = buf; return 0; fail: vhd_free_bat(s); return err; } static void vhd_free_bitmap_cache(struct vhd_state *s) { int i; struct vhd_bitmap *bm; for (i = 0; i < VHD_CACHE_SIZE; i++) { bm = s->bitmap_list + i; free(bm->map); free(bm->shadow); s->bitmap_free[i] = NULL; } memset(s->bitmap_list, 0, sizeof(struct vhd_bitmap) * VHD_CACHE_SIZE); } static int vhd_initialize_bitmap_cache(struct vhd_state *s) { int i, err, map_size; struct vhd_bitmap *bm; void *map, *shadow; 
memset(s->bitmap_list, 0, sizeof(struct vhd_bitmap) * VHD_CACHE_SIZE); s->bm_lru = 0; map_size = vhd_sectors_to_bytes(s->bm_secs); s->bm_free_count = VHD_CACHE_SIZE; for (i = 0; i < VHD_CACHE_SIZE; i++) { bm = s->bitmap_list + i; err = posix_memalign(&map, 512, map_size); if (err) goto fail; bm->map = map; err = posix_memalign(&shadow, 512, map_size); if (err) goto fail; bm->shadow = shadow; memset(bm->map, 0, map_size); memset(bm->shadow, 0, map_size); s->bitmap_free[i] = bm; } return 0; fail: vhd_free_bitmap_cache(s); return err; } static int vhd_initialize_dynamic_disk(struct vhd_state *s) { uint32_t bm_size; void *buf; int err; err = vhd_get_header(&s->vhd); if (err) { if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET)) EPRINTF("Error reading VHD DD header.\n"); return err; } if (s->vhd.header.hdr_ver != 0x00010000) { EPRINTF("unsupported header version! (0x%x)\n", s->vhd.header.hdr_ver); return -EINVAL; } s->spp = getpagesize() >> VHD_SECTOR_SHIFT; s->spb = s->vhd.header.block_size >> VHD_SECTOR_SHIFT; s->bm_secs = secs_round_up_no_zero(s->spb >> 3); s->padbm_size = (s->bm_secs / getpagesize()) * getpagesize(); if (s->bm_secs % getpagesize()) s->padbm_size += getpagesize(); err = posix_memalign(&buf, 512, s->padbm_size); if (err) return -err; s->padbm_buf = buf; bm_size = s->bm_secs << VHD_SECTOR_SHIFT; memset(s->padbm_buf, 0, s->padbm_size - bm_size); memset(s->padbm_buf + (s->padbm_size - bm_size), ~0, bm_size); s->debug_skipped_redundant_writes = 0; s->debug_done_redundant_writes = 0; if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_NO_CACHE)) return 0; err = vhd_initialize_bat(s); if (err) return err; err = vhd_initialize_bitmap_cache(s); if (err) { vhd_free_bat(s); return err; } return 0; } static int vhd_check_version(struct vhd_state *s) { if (strncmp(s->vhd.footer.crtr_app, "tap", 3)) return 0; if (s->vhd.footer.crtr_ver > VHD_CURRENT_VERSION) { if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET)) EPRINTF("WARNING: %s vhd creator version 0x%08x, " "but only 
versions up to 0x%08x are " "supported for IO\n", s->vhd.file, s->vhd.footer.crtr_ver, VHD_CURRENT_VERSION); return -EINVAL; } return 0; } static void vhd_log_open(struct vhd_state *s) { char buf[5]; uint32_t i, allocated, full; if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET)) return; snprintf(buf, sizeof(buf), "%s", s->vhd.footer.crtr_app); if (!vhd_type_dynamic(&s->vhd)) { DPRINTF("%s version: %s 0x%08x\n", s->vhd.file, buf, s->vhd.footer.crtr_ver); return; } allocated = 0; full = 0; for (i = 0; i < s->bat.bat.entries; i++) { if (bat_entry(s, i) != DD_BLK_UNUSED) allocated++; if (test_batmap(s, i)) full++; } DPRINTF("%s version: %s 0x%08x, b: %u, a: %u, f: %u, n: %"PRIu64"\n", s->vhd.file, buf, s->vhd.footer.crtr_ver, s->bat.bat.entries, allocated, full, s->next_db); } static int __vhd_open(td_driver_t *driver, const char *name, vhd_flag_t flags) { int i, o_flags, err; struct vhd_state *s; DBG(TLOG_INFO, "vhd_open: %s\n", name); if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT)) libvhd_set_log_level(1); s = (struct vhd_state *)driver->data; memset(s, 0, sizeof(struct vhd_state)); s->flags = flags; s->driver = driver; err = vhd_initialize(s); if (err) return err; o_flags = ((test_vhd_flag(flags, VHD_FLAG_OPEN_RDONLY)) ? 
VHD_OPEN_RDONLY : VHD_OPEN_RDWR); if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT)) set_vhd_flag(o_flags, VHD_OPEN_STRICT); err = vhd_open(&s->vhd, name, o_flags); if (err) { libvhd_set_log_level(1); err = vhd_open(&s->vhd, name, o_flags); if (err) { EPRINTF("Unable to open [%s] (%d)!\n", name, err); return err; } } err = vhd_check_version(s); if (err) goto fail; s->spb = s->spp = 1; if (vhd_type_dynamic(&s->vhd)) { err = vhd_initialize_dynamic_disk(s); if (err) goto fail; } vhd_log_open(s); SPB = s->spb; s->vreq_free_count = VHD_REQS_DATA; for (i = 0; i < VHD_REQS_DATA; i++) s->vreq_free[i] = s->vreq_list + i; driver->info.size = s->vhd.footer.curr_size >> VHD_SECTOR_SHIFT; driver->info.sector_size = VHD_SECTOR_SIZE; driver->info.info = 0; DBG(TLOG_INFO, "vhd_open: done (sz:%"PRIu64", sct:%lu, inf:%u)\n", driver->info.size, driver->info.sector_size, driver->info.info); if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT) && !test_vhd_flag(flags, VHD_FLAG_OPEN_RDONLY)) { err = vhd_kill_footer(s); if (err) { DPRINTF("ERROR killing footer: %d\n", err); goto fail; } s->writes++; } return 0; fail: vhd_free_bat(s); vhd_free_bitmap_cache(s); vhd_close(&s->vhd); vhd_free(s); return err; } static int _vhd_open(td_driver_t *driver, const char *name, td_flag_t flags) { vhd_flag_t vhd_flags = 0; if (flags & TD_OPEN_RDONLY) vhd_flags |= VHD_FLAG_OPEN_RDONLY; if (flags & TD_OPEN_QUIET) vhd_flags |= VHD_FLAG_OPEN_QUIET; if (flags & TD_OPEN_STRICT) vhd_flags |= VHD_FLAG_OPEN_STRICT; if (flags & TD_OPEN_QUERY) vhd_flags |= (VHD_FLAG_OPEN_QUERY | VHD_FLAG_OPEN_QUIET | VHD_FLAG_OPEN_RDONLY | VHD_FLAG_OPEN_NO_CACHE); /* pre-allocate for all but NFS and LVM storage */ driver->storage = tapdisk_storage_type(name); if (driver->storage != TAPDISK_STORAGE_TYPE_NFS && driver->storage != TAPDISK_STORAGE_TYPE_LVM) vhd_flags |= VHD_FLAG_OPEN_PREALLOCATE; return __vhd_open(driver, name, vhd_flags); } static void vhd_log_close(struct vhd_state *s) { uint32_t i, allocated, full; if 
(test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET)) return; allocated = 0; full = 0; for (i = 0; i < s->bat.bat.entries; i++) { if (bat_entry(s, i) != DD_BLK_UNUSED) allocated++; if (test_batmap(s, i)) full++; } DPRINTF("%s: b: %u, a: %u, f: %u, n: %"PRIu64"\n", s->vhd.file, s->bat.bat.entries, allocated, full, s->next_db); } static int _vhd_close(td_driver_t *driver) { int err; struct vhd_state *s; DBG(TLOG_WARN, "vhd_close\n"); s = (struct vhd_state *)driver->data; DPRINTF("gaps written/skipped: %ld/%ld\n", s->debug_done_redundant_writes, s->debug_skipped_redundant_writes); /* don't write footer if tapdisk is read-only */ if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_RDONLY)) goto free; /* * write footer if: * - we killed it on open (opened with strict) * - we've written data since opening */ if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_STRICT) || s->writes) { memcpy(&s->vhd.bat, &s->bat.bat, sizeof(vhd_bat_t)); err = vhd_write_footer(&s->vhd, &s->vhd.footer); memset(&s->vhd.bat, 0, sizeof(vhd_bat_t)); if (err) EPRINTF("writing %s footer: %d\n", s->vhd.file, err); if (!vhd_has_batmap(&s->vhd)) goto free; err = vhd_write_batmap(&s->vhd, &s->bat.batmap); if (err) EPRINTF("writing %s batmap: %d\n", s->vhd.file, err); } free: vhd_log_close(s); vhd_free_bat(s); vhd_free_bitmap_cache(s); vhd_close(&s->vhd); vhd_free(s); memset(s, 0, sizeof(struct vhd_state)); return 0; } int vhd_validate_parent(td_driver_t *child_driver, td_driver_t *parent_driver, td_flag_t flags) { struct vhd_state *child = (struct vhd_state *)child_driver->data; struct vhd_state *parent; if (parent_driver->type != DISK_TYPE_VHD) { if (child_driver->type != DISK_TYPE_VHD) return -EINVAL; if (child->vhd.footer.type != HD_TYPE_DIFF) return -EINVAL; if (!vhd_parent_raw(&child->vhd)) return -EINVAL; return 0; } parent = (struct vhd_state *)parent_driver->data; /* * This check removed because of cases like: * - parent VHD marked as 'hidden' * - parent VHD modified during coalesce */ /* if (stat(parent->vhd.file, 
&stats)) { DPRINTF("ERROR stating parent file %s\n", parent->vhd.file); return -errno; } if (child->hdr.prt_ts != vhd_time(stats.st_mtime)) { DPRINTF("ERROR: parent file has been modified since " "snapshot. Child image no longer valid.\n"); return -EINVAL; } */ if (uuid_compare(child->vhd.header.prt_uuid, parent->vhd.footer.uuid)) { DPRINTF("ERROR: %s: %s, %s: parent uuid has changed since " "snapshot. Child image no longer valid.\n", __func__, child->vhd.file, parent->vhd.file); return -EINVAL; } /* TODO: compare sizes */ return 0; } int vhd_get_parent_id(td_driver_t *driver, td_disk_id_t *id) { int err; char *parent; struct vhd_state *s; DBG(TLOG_DBG, "\n"); memset(id, 0, sizeof(td_disk_id_t)); s = (struct vhd_state *)driver->data; if (s->vhd.footer.type != HD_TYPE_DIFF) return TD_NO_PARENT; err = vhd_parent_locator_get(&s->vhd, &parent); if (err) return err; id->name = parent; id->type = vhd_parent_raw(&s->vhd) ? DISK_TYPE_AIO : DISK_TYPE_VHD; id->flags |= TD_OPEN_SHAREABLE|TD_OPEN_RDONLY; return 0; } static inline void clear_req_list(struct vhd_req_list *list) { list->head = list->tail = NULL; } static inline void add_to_tail(struct vhd_req_list *list, struct vhd_request *e) { if (!list->head) list->head = list->tail = e; else list->tail = list->tail->next = e; } static inline int remove_from_req_list(struct vhd_req_list *list, struct vhd_request *e) { struct vhd_request *i = list->head; if (list->head == e) { if (list->tail == e) clear_req_list(list); else list->head = list->head->next; return 0; } while (i->next) { if (i->next == e) { if (list->tail == e) { i->next = NULL; list->tail = i; } else i->next = i->next->next; return 0; } i = i->next; } return -EINVAL; } static inline void init_vhd_request(struct vhd_state *s, struct vhd_request *req) { memset(req, 0, sizeof(struct vhd_request)); req->state = s; } static inline void init_tx(struct vhd_transaction *tx) { memset(tx, 0, sizeof(struct vhd_transaction)); } static inline void add_to_transaction(struct 
vhd_transaction *tx, struct vhd_request *r) { ASSERT(!tx->closed); r->tx = tx; tx->started++; add_to_tail(&tx->requests, r); set_vhd_flag(tx->status, VHD_FLAG_TX_LIVE); DBG(TLOG_DBG, "blk: 0x%04"PRIx64", lsec: 0x%08"PRIx64", tx: %p, " "started: %d, finished: %d, status: %u\n", r->treq.sec / SPB, r->treq.sec, tx, tx->started, tx->finished, tx->status); } static inline int transaction_completed(struct vhd_transaction *tx) { return (tx->started == tx->finished); } static inline void init_bat(struct vhd_state *s) { s->bat.req.tx = NULL; s->bat.req.next = NULL; s->bat.req.error = 0; s->bat.pbw_blk = 0; s->bat.pbw_offset = 0; s->bat.status = 0; } static inline void lock_bat(struct vhd_state *s) { set_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED); } static inline void unlock_bat(struct vhd_state *s) { clear_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED); } static inline int bat_locked(struct vhd_state *s) { return test_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED); } static inline void init_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap *bm) { bm->blk = 0; bm->seqno = 0; bm->status = 0; init_tx(&bm->tx); clear_req_list(&bm->queue); clear_req_list(&bm->waiting); memset(bm->map, 0, vhd_sectors_to_bytes(s->bm_secs)); memset(bm->shadow, 0, vhd_sectors_to_bytes(s->bm_secs)); init_vhd_request(s, &bm->req); } static inline struct vhd_bitmap * get_bitmap(struct vhd_state *s, uint32_t block) { int i; struct vhd_bitmap *bm; for (i = 0; i < VHD_CACHE_SIZE; i++) { bm = s->bitmap[i]; if (bm && bm->blk == block) return bm; } return NULL; } static inline void lock_bitmap(struct vhd_bitmap *bm) { set_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED); } static inline void unlock_bitmap(struct vhd_bitmap *bm) { clear_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED); } static inline int bitmap_locked(struct vhd_bitmap *bm) { return test_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED); } static inline int bitmap_valid(struct vhd_bitmap *bm) { return !test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING); } static 
inline int bitmap_in_use(struct vhd_bitmap *bm) { return (test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING) || test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING) || test_vhd_flag(bm->tx.status, VHD_FLAG_TX_UPDATE_BAT) || bm->waiting.head || bm->tx.requests.head || bm->queue.head); } static inline int bitmap_full(struct vhd_state *s, struct vhd_bitmap *bm) { int i, n; n = s->spb >> 3; for (i = 0; i < n; i++) if (bm->map[i] != (char)0xFF) return 0; DBG(TLOG_DBG, "bitmap 0x%04x full\n", bm->blk); return 1; } static struct vhd_bitmap * remove_lru_bitmap(struct vhd_state *s) { int i, idx = 0; uint64_t seq = s->bm_lru; struct vhd_bitmap *bm, *lru = NULL; for (i = 0; i < VHD_CACHE_SIZE; i++) { bm = s->bitmap[i]; if (bm && bm->seqno < seq && !bitmap_locked(bm)) { idx = i; lru = bm; seq = lru->seqno; } } if (lru) { s->bitmap[idx] = NULL; ASSERT(!bitmap_in_use(lru)); } return lru; } static int alloc_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap **bitmap, uint32_t blk) { struct vhd_bitmap *bm; *bitmap = NULL; if (s->bm_free_count > 0) { bm = s->bitmap_free[--s->bm_free_count]; } else { bm = remove_lru_bitmap(s); if (!bm) return -EBUSY; } init_vhd_bitmap(s, bm); bm->blk = blk; *bitmap = bm; return 0; } static inline uint64_t __bitmap_lru_seqno(struct vhd_state *s) { int i; struct vhd_bitmap *bm; if (s->bm_lru == 0xffffffff) { s->bm_lru = 0; for (i = 0; i < VHD_CACHE_SIZE; i++) { bm = s->bitmap[i]; if (bm) { bm->seqno >>= 1; if (bm->seqno > s->bm_lru) s->bm_lru = bm->seqno; } } } return ++s->bm_lru; } static inline void touch_bitmap(struct vhd_state *s, struct vhd_bitmap *bm) { bm->seqno = __bitmap_lru_seqno(s); } static inline void install_bitmap(struct vhd_state *s, struct vhd_bitmap *bm) { int i; for (i = 0; i < VHD_CACHE_SIZE; i++) { if (!s->bitmap[i]) { touch_bitmap(s, bm); s->bitmap[i] = bm; return; } } ASSERT(0); } static inline void free_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap *bm) { int i; for (i = 0; i < VHD_CACHE_SIZE; i++) if (s->bitmap[i] == bm) 
break; ASSERT(!bitmap_locked(bm)); ASSERT(!bitmap_in_use(bm)); ASSERT(i < VHD_CACHE_SIZE); s->bitmap[i] = NULL; s->bitmap_free[s->bm_free_count++] = bm; } static int read_bitmap_cache(struct vhd_state *s, uint64_t sector, uint8_t op) { uint32_t blk, sec; struct vhd_bitmap *bm; /* in fixed disks, every block is present */ if (s->vhd.footer.type == HD_TYPE_FIXED) return VHD_BM_BIT_SET; blk = sector / s->spb; sec = sector % s->spb; if (blk > s->vhd.header.max_bat_size) { DPRINTF("ERROR: sec %"PRIu64" out of range, op = %d\n", sector, op); return -EINVAL; } if (bat_entry(s, blk) == DD_BLK_UNUSED) { if (op == VHD_OP_DATA_WRITE && s->bat.pbw_blk != blk && bat_locked(s)) return VHD_BM_BAT_LOCKED; return VHD_BM_BAT_CLEAR; } if (test_batmap(s, blk)) { DBG(TLOG_DBG, "batmap set for 0x%04x\n", blk); return VHD_BM_BIT_SET; } bm = get_bitmap(s, blk); if (!bm) return VHD_BM_NOT_CACHED; /* bump lru count */ touch_bitmap(s, bm); if (test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING)) return VHD_BM_READ_PENDING; return ((vhd_bitmap_test(&s->vhd, bm->map, sec)) ? 
VHD_BM_BIT_SET : VHD_BM_BIT_CLEAR); } static int read_bitmap_cache_span(struct vhd_state *s, uint64_t sector, int nr_secs, int value) { int ret; uint32_t blk, sec; struct vhd_bitmap *bm; /* in fixed disks, every block is present */ if (s->vhd.footer.type == HD_TYPE_FIXED) return nr_secs; sec = sector % s->spb; blk = sector / s->spb; if (test_batmap(s, blk)) return MIN(nr_secs, s->spb - sec); bm = get_bitmap(s, blk); ASSERT(bm && bitmap_valid(bm)); for (ret = 0; sec < s->spb && ret < nr_secs; sec++, ret++) if (vhd_bitmap_test(&s->vhd, bm->map, sec) != value) break; return ret; } static inline struct vhd_request * alloc_vhd_request(struct vhd_state *s) { struct vhd_request *req = NULL; if (s->vreq_free_count > 0) { req = s->vreq_free[--s->vreq_free_count]; ASSERT(req->treq.secs == 0); init_vhd_request(s, req); return req; } return NULL; } static inline void free_vhd_request(struct vhd_state *s, struct vhd_request *req) { memset(req, 0, sizeof(struct vhd_request)); s->vreq_free[s->vreq_free_count++] = req; } static inline void aio_read(struct vhd_state *s, struct vhd_request *req, uint64_t offset) { struct tiocb *tiocb = &req->tiocb; td_prep_read(tiocb, s->vhd.fd, req->treq.buf, vhd_sectors_to_bytes(req->treq.secs), offset, vhd_complete, req); td_queue_tiocb(s->driver, tiocb); s->queued++; s->reads++; s->read_size += req->treq.secs; TRACE(s); } static inline void aio_write(struct vhd_state *s, struct vhd_request *req, uint64_t offset) { struct tiocb *tiocb = &req->tiocb; td_prep_write(tiocb, s->vhd.fd, req->treq.buf, vhd_sectors_to_bytes(req->treq.secs), offset, vhd_complete, req); td_queue_tiocb(s->driver, tiocb); s->queued++; s->writes++; s->write_size += req->treq.secs; TRACE(s); } static inline uint64_t reserve_new_block(struct vhd_state *s, uint32_t blk) { int gap = 0; ASSERT(!test_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED)); /* data region of segment should begin on page boundary */ if ((s->next_db + s->bm_secs) % s->spp) gap = (s->spp - ((s->next_db + 
s->bm_secs) % s->spp));

	/* record the pending BAT update; next_db itself is not advanced here */
	s->bat.pbw_blk    = blk;
	s->bat.pbw_offset = s->next_db + gap;

	return s->next_db;
}

/*
 * Issue the BAT write for the pending block (bat.pbw_blk): the 512-byte BAT
 * sector containing its entry is copied into the staging buffer, the entry is
 * set to bat.pbw_offset, all 128 entries are passed through BE32_OUT, and the
 * sector is written at its position in the on-disk table.  The BAT must be
 * locked.  Always returns 0.
 */
static int
schedule_bat_write(struct vhd_state *s)
{
	int i;
	uint32_t blk;
	char *buf;
	uint64_t offset;
	struct vhd_request *req;

	ASSERT(bat_locked(s));

	req = &s->bat.req;
	buf = s->bat.bat_buf;
	blk = s->bat.pbw_blk;

	init_vhd_request(s, req);
	/* 128 four-byte entries per 512-byte sector */
	memcpy(buf, &bat_entry(s, blk - (blk % 128)), 512);

	((uint32_t *)buf)[blk % 128] = s->bat.pbw_offset;

	for (i = 0; i < 128; i++)
		BE32_OUT(&((uint32_t *)buf)[i]);

	offset         = s->vhd.header.table_offset + (blk - (blk % 128)) * 4;
	req->treq.secs = 1;
	req->treq.buf  = buf;
	req->op        = VHD_OP_BAT_WRITE;
	req->next      = NULL;

	aio_write(s, req, offset);
	set_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED);

	DBG(TLOG_DBG, "blk: 0x%04x, pbwo: 0x%08"PRIx64", "
	    "table_offset: 0x%08"PRIx64"\n", blk, s->bat.pbw_offset, offset);

	return 0;
}

/*
 * Write zeros from sector @lb_end through the end of the new block's bitmap
 * (alignment gap plus bm_secs sectors), as part of @bm's transaction.  The
 * bitmap is locked before the request joins the transaction.
 */
static void
schedule_zero_bm_write(struct vhd_state *s,
		       struct vhd_bitmap *bm, uint64_t lb_end)
{
	uint64_t offset;
	struct vhd_request *req = &s->bat.zero_req;

	init_vhd_request(s, req);

	offset         = vhd_sectors_to_bytes(lb_end);
	req->op        = VHD_OP_ZERO_BM_WRITE;
	req->treq.sec  = s->bat.pbw_blk * s->spb;
	req->treq.secs = (s->bat.pbw_offset - lb_end) + s->bm_secs;
	req->treq.buf  = vhd_zeros(vhd_sectors_to_bytes(req->treq.secs));
	req->next      = NULL;

	DBG(TLOG_DBG, "blk: 0x%04x, writing zero bitmap at 0x%08"PRIx64"\n",
	    s->bat.pbw_blk, offset);

	lock_bitmap(bm);
	add_to_transaction(&bm->tx, req);
	aio_write(s, req, offset);
}

/* This is a performance optimization. When writing sequentially into full
 * blocks, skipping (up-to-date) bitmaps causes an approx. 25% reduction in
 * throughput. To prevent skipping, we issue redundant writes into the (padded)
 * bitmap area just to make all writes sequential. This will help VHDs on raw
 * block devices, while the FS-based VHDs shouldn't suffer much.
 *
 * Note that it only makes sense to perform this redundant bitmap write if the
 * block is completely full (i.e. the batmap entry is set). If the block is not
 * completely full then one of the following two things will be true:
 *  1. we'll either be allocating new sectors in this block and writing its
 *     bitmap transactionally, which will be slow anyway; or
 *  2. the IO will be skipping over the unallocated sectors again, so the
 *     pattern will not be sequential anyway
 * In either case a redundant bitmap write becomes pointless. This fact
 * simplifies the implementation of redundant writes: since we know the bitmap
 * cannot be updated by anyone else, we don't have to worry about transactions
 * or potential write conflicts.
 */
static void
schedule_redundant_bm_write(struct vhd_state *s, uint32_t blk)
{
	uint64_t offset;
	struct vhd_request *req;

	ASSERT(s->vhd.footer.type != HD_TYPE_FIXED);
	ASSERT(test_batmap(s, blk));

	/* best effort: silently skipped when no request is available */
	req = alloc_vhd_request(s);
	if (!req)
		return;

	req->treq.buf = s->padbm_buf;

	offset = bat_entry(s, blk);
	ASSERT(offset != DD_BLK_UNUSED);
	offset <<= VHD_SECTOR_SHIFT;
	/* start early enough that the buffer's all-ones tail lands on the bitmap */
	offset -= s->padbm_size - (s->bm_secs << VHD_SECTOR_SHIFT);

	req->op        = VHD_OP_REDUNDANT_BM_WRITE;
	req->treq.sec  = blk * s->spb;
	req->treq.secs = s->padbm_size >> VHD_SECTOR_SHIFT;
	req->next      = NULL;

	DBG(TLOG_DBG, "blk: %u, writing redundant bitmap at %" PRIu64 "\n",
	    blk, offset);

	aio_write(s, req, offset);
}

/*
 * Begin allocating unallocated block @blk without preallocation: make sure
 * an (empty) bitmap for it is cached, lock the BAT, reserve the block's
 * location and schedule the zero-bitmap write, flagging the bitmap's
 * transaction as waiting for a BAT update.  When the BAT is already locked
 * (asserted to be for the same block) nothing is done.
 *
 * Returns 0, or the error from alloc_vhd_bitmap().
 */
static int
update_bat(struct vhd_state *s, uint32_t blk)
{
	int err;
	uint64_t lb_end;
	struct vhd_bitmap *bm;

	ASSERT(bat_entry(s, blk) == DD_BLK_UNUSED);

	if (bat_locked(s)) {
		ASSERT(s->bat.pbw_blk == blk);
		return 0;
	}

	/* empty bitmap could already be in
	 * cache if earlier bat update failed */
	bm = get_bitmap(s, blk);
	if (!bm) {
		/* install empty bitmap in cache */
		err = alloc_vhd_bitmap(s, &bm, blk);
		if (err)
			return err;

		install_bitmap(s, bm);
	}

	lock_bat(s);
	lb_end = reserve_new_block(s, blk);
	schedule_zero_bm_write(s, bm, lb_end);
	set_vhd_flag(bm->tx.status, VHD_FLAG_TX_UPDATE_BAT);

	return 0;
}

static int
allocate_block(struct vhd_state *s, uint32_t blk)
{
	int err, gap;
	uint64_t offset, size;
	struct vhd_bitmap *bm;
	ssize_t
count; ASSERT(bat_entry(s, blk) == DD_BLK_UNUSED); if (bat_locked(s)) { ASSERT(s->bat.pbw_blk == blk); if (s->bat.req.error) return -EBUSY; return 0; } gap = 0; s->bat.pbw_blk = blk; offset = vhd_sectors_to_bytes(s->next_db); /* data region of segment should begin on page boundary */ if ((s->next_db + s->bm_secs) % s->spp) { gap = (s->spp - ((s->next_db + s->bm_secs) % s->spp)); s->next_db += gap; } s->bat.pbw_offset = s->next_db; DBG(TLOG_DBG, "blk: 0x%04x, pbwo: 0x%08"PRIx64"\n", blk, s->bat.pbw_offset); if (lseek(s->vhd.fd, offset, SEEK_SET) == (off_t)-1) { ERR(s, -errno, "lseek failed\n"); return -errno; } size = vhd_sectors_to_bytes(s->spb + s->bm_secs + gap); count = write(s->vhd.fd, vhd_zeros(size), size); if (count != size) { err = count < 0 ? -errno : -ENOSPC; ERR(s, -errno, "write failed (%zd, offset %"PRIu64")\n", count, offset); return err; } /* empty bitmap could already be in * cache if earlier bat update failed */ bm = get_bitmap(s, blk); if (!bm) { /* install empty bitmap in cache */ err = alloc_vhd_bitmap(s, &bm, blk); if (err) return err; install_bitmap(s, bm); } lock_bat(s); lock_bitmap(bm); schedule_bat_write(s); add_to_transaction(&bm->tx, &s->bat.req); return 0; } static int schedule_data_read(struct vhd_state *s, td_request_t treq, vhd_flag_t flags) { uint64_t offset; uint32_t blk = 0, sec = 0; struct vhd_bitmap *bm; struct vhd_request *req; if (s->vhd.footer.type == HD_TYPE_FIXED) { offset = vhd_sectors_to_bytes(treq.sec); goto make_request; } blk = treq.sec / s->spb; sec = treq.sec % s->spb; bm = get_bitmap(s, blk); offset = bat_entry(s, blk); ASSERT(offset != DD_BLK_UNUSED); ASSERT(test_batmap(s, blk) || (bm && bitmap_valid(bm))); offset += s->bm_secs + sec; offset = vhd_sectors_to_bytes(offset); make_request: req = alloc_vhd_request(s); if (!req) return -EBUSY; req->treq = treq; req->flags = flags; req->op = VHD_OP_DATA_READ; req->next = NULL; aio_read(s, req, offset); DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, sec: 0x%04x, " 
"nr_secs: 0x%04x, offset: 0x%08"PRIx64", flags: 0x%08x, buf: %p\n", s->vhd.file, treq.sec, blk, sec, treq.secs, offset, req->flags, treq.buf); return 0; } static int schedule_data_write(struct vhd_state *s, td_request_t treq, vhd_flag_t flags) { int err; uint64_t offset; uint32_t blk = 0, sec = 0; struct vhd_bitmap *bm = NULL; struct vhd_request *req; if (s->vhd.footer.type == HD_TYPE_FIXED) { offset = vhd_sectors_to_bytes(treq.sec); goto make_request; } blk = treq.sec / s->spb; sec = treq.sec % s->spb; offset = bat_entry(s, blk); if (test_vhd_flag(flags, VHD_FLAG_REQ_UPDATE_BAT)) { if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)) err = allocate_block(s, blk); else err = update_bat(s, blk); if (err) return err; offset = s->bat.pbw_offset; } offset += s->bm_secs + sec; offset = vhd_sectors_to_bytes(offset); make_request: req = alloc_vhd_request(s); if (!req) return -EBUSY; req->treq = treq; req->flags = flags; req->op = VHD_OP_DATA_WRITE; req->next = NULL; if (test_vhd_flag(flags, VHD_FLAG_REQ_UPDATE_BITMAP)) { bm = get_bitmap(s, blk); ASSERT(bm && bitmap_valid(bm)); lock_bitmap(bm); if (bm->tx.closed) { add_to_tail(&bm->queue, req); set_vhd_flag(req->flags, VHD_FLAG_REQ_QUEUED); } else add_to_transaction(&bm->tx, req); } else if (sec == 0 && /* first sector inside data block */ s->vhd.footer.type != HD_TYPE_FIXED && bat_entry(s, blk) != s->first_db && test_batmap(s, blk)) schedule_redundant_bm_write(s, blk); aio_write(s, req, offset); DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, sec: 0x%04x, " "nr_secs: 0x%04x, offset: 0x%08"PRIx64", flags: 0x%08x\n", s->vhd.file, treq.sec, blk, sec, treq.secs, offset, req->flags); return 0; } static int schedule_bitmap_read(struct vhd_state *s, uint32_t blk) { int err; uint64_t offset; struct vhd_bitmap *bm; struct vhd_request *req = NULL; ASSERT(vhd_type_dynamic(&s->vhd)); offset = bat_entry(s, blk); ASSERT(offset != DD_BLK_UNUSED); ASSERT(!get_bitmap(s, blk)); offset = vhd_sectors_to_bytes(offset); err = 
alloc_vhd_bitmap(s, &bm, blk);
	if (err)
		return err;

	/* the bitmap's embedded request carries the read into bm->map */
	req = &bm->req;
	init_vhd_request(s, req);

	req->treq.sec  = blk * s->spb;
	req->treq.secs = s->bm_secs;
	req->treq.buf  = bm->map;
	req->treq.cb   = NULL;
	req->op        = VHD_OP_BITMAP_READ;
	req->next      = NULL;

	aio_read(s, req, offset);
	lock_bitmap(bm);
	install_bitmap(s, bm);
	set_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING);

	DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, nr_secs: 0x%04x, "
	    "offset: 0x%08"PRIx64"\n", s->vhd.file, req->treq.sec, blk,
	    req->treq.secs, offset);

	return 0;
}

/*
 * Write the shadow copy of block @blk's bitmap to disk using the bitmap's
 * embedded request.  The bitmap must be cached, valid and without a write
 * already pending (asserted).  When the block has no BAT entry yet, the
 * location reserved by the pending BAT update is used instead.
 */
static void
schedule_bitmap_write(struct vhd_state *s, uint32_t blk)
{
	uint64_t offset;
	struct vhd_bitmap  *bm;
	struct vhd_request *req;

	bm     = get_bitmap(s, blk);
	offset = bat_entry(s, blk);

	ASSERT(vhd_type_dynamic(&s->vhd));
	ASSERT(bm && bitmap_valid(bm) &&
	       !test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING));

	if (offset == DD_BLK_UNUSED) {
		ASSERT(bat_locked(s) && s->bat.pbw_blk == blk);
		offset = s->bat.pbw_offset;
	}

	offset = vhd_sectors_to_bytes(offset);

	req = &bm->req;
	init_vhd_request(s, req);

	req->treq.sec  = blk * s->spb;
	req->treq.secs = s->bm_secs;
	req->treq.buf  = bm->shadow;
	req->treq.cb   = NULL;
	req->op        = VHD_OP_BITMAP_WRITE;
	req->next      = NULL;

	aio_write(s, req, offset);
	lock_bitmap(bm);
	touch_bitmap(s, bm);     /* bump lru count */
	set_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING);

	DBG(TLOG_DBG, "%s: blk: 0x%04x, sec: 0x%08"PRIx64", nr_secs: 0x%04x, "
	    "offset: 0x%"PRIx64"\n", s->vhd.file, blk, req->treq.sec,
	    req->treq.secs, offset);
}

/*
 * queued requests will be submitted once the bitmap
 * describing them is read and the requests are validated.
 *
 * Parks @treq on the waiting list of its block's bitmap, which must be
 * cached with a read pending (asserted).  Returns 0, or -EBUSY when no
 * request structure is free.
 */
static int
__vhd_queue_request(struct vhd_state *s, uint8_t op, td_request_t treq)
{
	uint32_t blk;
	struct vhd_bitmap  *bm;
	struct vhd_request *req;

	ASSERT(vhd_type_dynamic(&s->vhd));

	blk = treq.sec / s->spb;
	bm  = get_bitmap(s, blk);

	ASSERT(bm && test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING));

	req = alloc_vhd_request(s);
	if (!req)
		return -EBUSY;

	req->treq = treq;
	req->op   = op;
	req->next = NULL;

	add_to_tail(&bm->waiting, req);
	lock_bitmap(bm);

	DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x nr_secs: 0x%04x, "
	    "op: %u\n", s->vhd.file, treq.sec, blk, treq.secs, op);

	TRACE(s);
	return 0;
}

/*
 * Driver read entry point.  The request is consumed piece by piece; each
 * piece is sized and routed by the state read_bitmap_cache() reports for its
 * first sector:
 *   - unallocated block or clear sectors: forwarded via td_forward_request()
 *   - present sectors: read from this image
 *   - bitmap not cached / read pending: a bitmap read is started if needed
 *     and the piece waits on the bitmap
 * On any error the remainder of the request is completed with that error.
 */
static void
vhd_queue_read(td_driver_t *driver, td_request_t treq)
{
	struct vhd_state *s = (struct vhd_state *)driver->data;

	DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", secs: 0x%04x (seg: %d)\n",
	    s->vhd.file, treq.sec, treq.secs, treq.sidx);

	while (treq.secs) {
		int err;
		td_request_t clone;

		err   = 0;
		clone = treq;

		switch (read_bitmap_cache(s, clone.sec, VHD_OP_DATA_READ)) {
		case -EINVAL:
			err = -EINVAL;
			goto fail;

		case VHD_BM_BAT_CLEAR:
			/* clip to the end of the block */
			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
			td_forward_request(clone);
			break;

		case VHD_BM_BIT_CLEAR:
			clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 0);
			td_forward_request(clone);
			break;

		case VHD_BM_BIT_SET:
			clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 1);
			err = schedule_data_read(s, clone, 0);
			if (err)
				goto fail;
			break;

		case VHD_BM_NOT_CACHED:
			err = schedule_bitmap_read(s, clone.sec / s->spb);
			if (err)
				goto fail;

			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
			err = __vhd_queue_request(s, VHD_OP_DATA_READ, clone);
			if (err)
				goto fail;
			break;

		case VHD_BM_READ_PENDING:
			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
			err = __vhd_queue_request(s, VHD_OP_DATA_READ, clone);
			if (err)
				goto fail;
			break;

		case VHD_BM_BAT_LOCKED:
		default:
			ASSERT(0);
			break;
		}

		/* advance past the piece just handled */
		treq.sec  += clone.secs;
		treq.secs -= clone.secs;
		treq.buf  += vhd_sectors_to_bytes(clone.secs);
		continue;

	fail:
		clone.secs =
treq.secs;
		td_complete_request(clone, err);
		break;
	}
}

/*
 * Driver write entry point.  The request is consumed piece by piece; each
 * piece is sized and routed by the state read_bitmap_cache() reports for its
 * first sector:
 *   - BAT locked for another block: fails with -EBUSY
 *   - unallocated block: written with BAT and bitmap updates
 *   - clear sectors: written with a bitmap update
 *   - present sectors: written directly
 *   - bitmap not cached / read pending: a bitmap read is started if needed
 *     and the piece waits on the bitmap
 * On any error the remainder of the request is completed with that error.
 */
static void
vhd_queue_write(td_driver_t *driver, td_request_t treq)
{
	struct vhd_state *s = (struct vhd_state *)driver->data;

	DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", secs: 0x%04x, (seg: %d)\n",
	    s->vhd.file, treq.sec, treq.secs, treq.sidx);

	while (treq.secs) {
		int err;
		uint8_t flags;
		td_request_t clone;

		err   = 0;
		flags = 0;
		clone = treq;

		switch (read_bitmap_cache(s, clone.sec, VHD_OP_DATA_WRITE)) {
		case -EINVAL:
			err = -EINVAL;
			goto fail;

		case VHD_BM_BAT_LOCKED:
			err = -EBUSY;
			goto fail;

		case VHD_BM_BAT_CLEAR:
			flags = (VHD_FLAG_REQ_UPDATE_BAT |
				 VHD_FLAG_REQ_UPDATE_BITMAP);
			/* clip to the end of the block */
			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
			err = schedule_data_write(s, clone, flags);
			if (err)
				goto fail;
			break;

		case VHD_BM_BIT_CLEAR:
			flags = VHD_FLAG_REQ_UPDATE_BITMAP;
			clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 0);
			err = schedule_data_write(s, clone, flags);
			if (err)
				goto fail;
			break;

		case VHD_BM_BIT_SET:
			clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 1);
			err = schedule_data_write(s, clone, 0);
			if (err)
				goto fail;
			break;

		case VHD_BM_NOT_CACHED:
			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
			err = schedule_bitmap_read(s, clone.sec / s->spb);
			if (err)
				goto fail;

			err = __vhd_queue_request(s, VHD_OP_DATA_WRITE, clone);
			if (err)
				goto fail;
			break;

		case VHD_BM_READ_PENDING:
			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
			err = __vhd_queue_request(s, VHD_OP_DATA_WRITE, clone);
			if (err)
				goto fail;
			break;

		default:
			ASSERT(0);
			break;
		}

		/* advance past the piece just handled */
		treq.sec  += clone.secs;
		treq.secs -= clone.secs;
		treq.buf  += vhd_sectors_to_bytes(clone.secs);
		continue;

	fail:
		clone.secs = treq.secs;
		td_complete_request(clone, err);
		break;
	}
}

/*
 * Complete every request on @list and return it to the free stack.  Each
 * request is completed with @error when that is non-zero, otherwise with its
 * own error.
 */
static inline void
signal_completion(struct vhd_request *list, int error)
{
	struct vhd_state *s;
	struct vhd_request *r, *next;

	if (!list)
		return;

	r = list;
	s = list->state;

	while (r) {
		int err;

		err  = (error ? error : r->error);
		/* fetch the link first: free_vhd_request() zeroes r */
		next = r->next;
		td_complete_request(r->treq, err);
		DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64", "
		    "err: %d\n", r->treq.sec, r->treq.sec / s->spb, err);
		free_vhd_request(s, r);
		r    = next;

		s->returned++;
		TRACE(s);
	}
}

/*
 * Move the writes queued on @bm into its (freshly reset) transaction.
 * Requests already flagged finished are counted as such and, when error
 * free, have their sectors set in the shadow bitmap.  If the block still has
 * no BAT entry the transaction is marked with -EIO.  When everything moved
 * over has already finished, the data transaction is finished at once.
 */
static void
start_new_bitmap_transaction(struct vhd_state *s, struct vhd_bitmap *bm)
{
	struct vhd_transaction *tx;
	struct vhd_request *r, *next;
	int i;

	if (!bm->queue.head)
		return;

	DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk);

	r  = bm->queue.head;
	tx = &bm->tx;
	clear_req_list(&bm->queue);

	if (r && bat_entry(s, bm->blk) == DD_BLK_UNUSED)
		tx->error = -EIO;

	while (r) {
		next    = r->next;
		r->next = NULL;
		clear_vhd_flag(r->flags, VHD_FLAG_REQ_QUEUED);

		add_to_transaction(tx, r);
		if (test_vhd_flag(r->flags, VHD_FLAG_REQ_FINISHED)) {
			tx->finished++;
			if (!r->error) {
				uint32_t sec = r->treq.sec % s->spb;
				for (i = 0; i < r->treq.secs; i++)
					vhd_bitmap_set(&s->vhd,
						       bm->shadow, sec + i);
			}
		}
		r = next;
	}

	/* perhaps all the queued writes already completed? */
	if (tx->started && transaction_completed(tx))
		finish_data_transaction(s, bm);
}

/*
 * Release the BAT lock held for @bm's block, if any.  The lock is kept, and
 * the transaction closed, only when the BAT write failed while the
 * transaction is live.
 */
static void
finish_bat_transaction(struct vhd_state *s, struct vhd_bitmap *bm)
{
	struct vhd_transaction *tx = &bm->tx;

	if (!bat_locked(s))
		return;

	if (s->bat.pbw_blk != bm->blk)
		return;

	if (!s->bat.req.error)
		goto release;

	if (!test_vhd_flag(tx->status, VHD_FLAG_TX_LIVE))
		goto release;

	tx->closed = 1;
	return;

 release:
	DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk);
	unlock_bat(s);
	init_bat(s);
}

/*
 * Conclude @bm's transaction with result @error (an error already recorded
 * on the transaction takes precedence).
 *
 * Without preallocation, a transaction still waiting for its BAT write is
 * only linked to the BAT request and left pending.  Otherwise the shadow
 * bitmap is rolled back (error) or committed to the map (success, marking
 * the batmap when the block became full), all requests are completed, the
 * transaction is reset, queued writes start a new transaction, and the
 * bitmap and BAT locks are dropped when no longer needed.
 */
static void
finish_bitmap_transaction(struct vhd_state *s,
			  struct vhd_bitmap *bm, int error)
{
	int map_size;
	struct vhd_transaction *tx = &bm->tx;

	DBG(TLOG_DBG, "blk: 0x%04x, err: %d\n", bm->blk, error);
	tx->error = (tx->error ? tx->error : error);
	map_size  = vhd_sectors_to_bytes(s->bm_secs);

	if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)) {
		if (test_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT)) {
			/* still waiting for bat write */
			ASSERT(bm->blk == s->bat.pbw_blk);
			ASSERT(test_vhd_flag(s->bat.status,
					     VHD_FLAG_BAT_WRITE_STARTED));
			s->bat.req.tx = tx;
			return;
		}
	}

	if (tx->error) {
		/* undo changes to shadow */
		memcpy(bm->shadow, bm->map, map_size);
	} else {
		/* complete atomic write */
		memcpy(bm->map, bm->shadow, map_size);
		if (!test_batmap(s, bm->blk) && bitmap_full(s, bm))
			set_batmap(s, bm->blk);
	}

	/* transaction done; signal completions */
	signal_completion(tx->requests.head, tx->error);
	init_tx(tx);

	start_new_bitmap_transaction(s, bm);

	if (!bitmap_in_use(bm))
		unlock_bitmap(bm);

	finish_bat_transaction(s, bm);
}

/*
 * Called when all data writes of @bm's transaction are done: close the
 * transaction, then either flush the bitmap (no error) or conclude the
 * transaction immediately.
 */
static void
finish_data_transaction(struct vhd_state *s, struct vhd_bitmap *bm)
{
	struct vhd_transaction *tx = &bm->tx;

	DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk);

	tx->closed = 1;

	if (!tx->error)
		return schedule_bitmap_write(s, bm->blk);

	return finish_bitmap_transaction(s, bm, 0);
}

static void
finish_bat_write(struct vhd_request *req)
{
	struct vhd_bitmap *bm;
	struct vhd_transaction *tx;
	struct vhd_state *s = req->state;

	s->returned++;
	TRACE(s);

	bm = get_bitmap(s, s->bat.pbw_blk);

	DBG(TLOG_DBG, "blk 0x%04x, pbwo: 0x%08"PRIx64", err %d\n",
	    s->bat.pbw_blk, s->bat.pbw_offset, req->error);
	ASSERT(bm && bitmap_valid(bm));
	ASSERT(bat_locked(s) &&
	       test_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED));

	tx = &bm->tx;
	ASSERT(test_vhd_flag(tx->status, VHD_FLAG_TX_LIVE));

	if (!req->error) {
		bat_entry(s, s->bat.pbw_blk) = s->bat.pbw_offset;
		s->next_db = s->bat.pbw_offset + s->spb + s->bm_secs;
	} else
		tx->error = req->error;

	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)) {
		tx->finished++;
		remove_from_req_list(&tx->requests, req);
		if (transaction_completed(tx))
			finish_data_transaction(s, bm);
	} else {
		clear_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT);
		if (s->bat.req.tx)
finish_bitmap_transaction(s, bm, req->error); } finish_bat_transaction(s, bm); } static void finish_zero_bm_write(struct vhd_request *req) { uint32_t blk; struct vhd_bitmap *bm; struct vhd_transaction *tx = req->tx; struct vhd_state *s = req->state; s->returned++; TRACE(s); blk = req->treq.sec / s->spb; bm = get_bitmap(s, blk); DBG(TLOG_DBG, "blk: 0x%04x\n", blk); ASSERT(bat_locked(s)); ASSERT(s->bat.pbw_blk == blk); ASSERT(bm && bitmap_valid(bm) && bitmap_locked(bm)); tx->finished++; remove_from_req_list(&tx->requests, req); if (req->error) { unlock_bat(s); init_bat(s); tx->error = req->error; clear_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT); } else schedule_bat_write(s); if (transaction_completed(tx)) finish_data_transaction(s, bm); } static int finish_redundant_bm_write(struct vhd_request *req) { /* uint32_t blk; */ struct vhd_state *s = (struct vhd_state *) req->state; s->returned++; TRACE(s); /* blk = req->treq.sec / s->spb; DBG(TLOG_DBG, "blk: %u\n", blk); */ if (req->error) { ERR(s, req->error, "lsec: 0x%08"PRIx64, req->treq.sec); } free_vhd_request(s, req); s->debug_done_redundant_writes++; return 0; } static void finish_bitmap_read(struct vhd_request *req) { uint32_t blk; struct vhd_bitmap *bm; struct vhd_request *r, *next; struct vhd_state *s = req->state; s->returned++; TRACE(s); blk = req->treq.sec / s->spb; bm = get_bitmap(s, blk); DBG(TLOG_DBG, "blk: 0x%04x\n", blk); ASSERT(bm && test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING)); r = bm->waiting.head; clear_req_list(&bm->waiting); clear_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING); if (!req->error) { memcpy(bm->shadow, bm->map, vhd_sectors_to_bytes(s->bm_secs)); while (r) { struct vhd_request tmp; tmp = *r; next = r->next; free_vhd_request(s, r); ASSERT(tmp.op == VHD_OP_DATA_READ || tmp.op == VHD_OP_DATA_WRITE); if (tmp.op == VHD_OP_DATA_READ) vhd_queue_read(s->driver, tmp.treq); else if (tmp.op == VHD_OP_DATA_WRITE) vhd_queue_write(s->driver, tmp.treq); r = next; } } else { int err = 
req->error; unlock_bitmap(bm); free_vhd_bitmap(s, bm); return signal_completion(r, err); } if (!bitmap_in_use(bm)) unlock_bitmap(bm); } static void finish_bitmap_write(struct vhd_request *req) { uint32_t blk; struct vhd_bitmap *bm; struct vhd_transaction *tx; struct vhd_state *s = req->state; s->returned++; TRACE(s); blk = req->treq.sec / s->spb; bm = get_bitmap(s, blk); tx = &bm->tx; DBG(TLOG_DBG, "blk: 0x%04x, started: %d, finished: %d\n", blk, tx->started, tx->finished); ASSERT(tx->closed); ASSERT(bm && bitmap_valid(bm)); ASSERT(test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING)); clear_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING); finish_bitmap_transaction(s, bm, req->error); } static void finish_data_read(struct vhd_request *req) { struct vhd_state *s = req->state; DBG(TLOG_DBG, "lsec 0x%08"PRIx64", blk: 0x%04"PRIx64"\n", req->treq.sec, req->treq.sec / s->spb); signal_completion(req, 0); } static void finish_data_write(struct vhd_request *req) { int i; struct vhd_transaction *tx = req->tx; struct vhd_state *s = (struct vhd_state *)req->state; set_vhd_flag(req->flags, VHD_FLAG_REQ_FINISHED); if (tx) { uint32_t blk, sec; struct vhd_bitmap *bm; blk = req->treq.sec / s->spb; sec = req->treq.sec % s->spb; bm = get_bitmap(s, blk); ASSERT(bm && bitmap_valid(bm) && bitmap_locked(bm)); tx->finished++; DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x04%"PRIx64", " "tx->started: %d, tx->finished: %d\n", req->treq.sec, req->treq.sec / s->spb, tx->started, tx->finished); if (!req->error) for (i = 0; i < req->treq.secs; i++) vhd_bitmap_set(&s->vhd, bm->shadow, sec + i); if (transaction_completed(tx)) finish_data_transaction(s, bm); } else if (!test_vhd_flag(req->flags, VHD_FLAG_REQ_QUEUED)) { ASSERT(!req->next); DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64"\n", req->treq.sec, req->treq.sec / s->spb); signal_completion(req, 0); } } void vhd_complete(void *arg, struct tiocb *tiocb, int err) { struct vhd_request *req = (struct vhd_request *)arg; struct vhd_state *s = 
req->state; struct iocb *io = &tiocb->iocb; s->completed++; TRACE(s); req->error = err; if (req->error) ERR(s, req->error, "%s: op: %u, lsec: %"PRIu64", secs: %u, " "nbytes: %lu, blk: %"PRIu64", blk_offset: %u", s->vhd.file, req->op, req->treq.sec, req->treq.secs, io->u.c.nbytes, req->treq.sec / s->spb, bat_entry(s, req->treq.sec / s->spb)); switch (req->op) { case VHD_OP_DATA_READ: finish_data_read(req); break; case VHD_OP_DATA_WRITE: finish_data_write(req); break; case VHD_OP_BITMAP_READ: finish_bitmap_read(req); break; case VHD_OP_BITMAP_WRITE: finish_bitmap_write(req); break; case VHD_OP_ZERO_BM_WRITE: finish_zero_bm_write(req); break; case VHD_OP_REDUNDANT_BM_WRITE: finish_redundant_bm_write(req); break; case VHD_OP_BAT_WRITE: finish_bat_write(req); break; default: ASSERT(0); break; } } void vhd_debug(td_driver_t *driver) { int i; struct vhd_state *s = (struct vhd_state *)driver->data; DBG(TLOG_WARN, "%s: QUEUED: 0x%08"PRIx64", COMPLETED: 0x%08"PRIx64", " "RETURNED: 0x%08"PRIx64"\n", s->vhd.file, s->queued, s->completed, s->returned); DBG(TLOG_WARN, "WRITES: 0x%08"PRIx64", AVG_WRITE_SIZE: %f\n", s->writes, (s->writes ? ((float)s->write_size / s->writes) : 0.0)); DBG(TLOG_WARN, "READS: 0x%08"PRIx64", AVG_READ_SIZE: %f\n", s->reads, (s->reads ? ((float)s->read_size / s->reads) : 0.0)); DBG(TLOG_WARN, "ALLOCATED REQUESTS: (%u total)\n", VHD_REQS_DATA); for (i = 0; i < VHD_REQS_DATA; i++) { struct vhd_request *r = &s->vreq_list[i]; td_request_t *t = &r->treq; const char *vname = t->vreq ? 
t->vreq->name: NULL; if (t->secs) DBG(TLOG_WARN, "%d: vreq: %s.%d, err: %d, op: %d," " lsec: 0x%08"PRIx64", flags: %d, this: %p, " "next: %p, tx: %p\n", i, vname, t->sidx, r->error, r->op, t->sec, r->flags, r, r->next, r->tx); } DBG(TLOG_WARN, "BITMAP CACHE:\n"); for (i = 0; i < VHD_CACHE_SIZE; i++) { int qnum = 0, wnum = 0, rnum = 0; struct vhd_bitmap *bm = s->bitmap[i]; struct vhd_transaction *tx; struct vhd_request *r; if (!bm) continue; tx = &bm->tx; r = bm->queue.head; while (r) { qnum++; r = r->next; } r = bm->waiting.head; while (r) { wnum++; r = r->next; } r = tx->requests.head; while (r) { rnum++; r = r->next; } DBG(TLOG_WARN, "%d: blk: 0x%04x, status: 0x%08x, q: %p, qnum: %d, w: %p, " "wnum: %d, locked: %d, in use: %d, tx: %p, tx_error: %d, " "started: %d, finished: %d, status: %u, reqs: %p, nreqs: %d\n", i, bm->blk, bm->status, bm->queue.head, qnum, bm->waiting.head, wnum, bitmap_locked(bm), bitmap_in_use(bm), tx, tx->error, tx->started, tx->finished, tx->status, tx->requests.head, rnum); } DBG(TLOG_WARN, "BAT: status: 0x%08x, pbw_blk: 0x%04x, " "pbw_off: 0x%08"PRIx64", tx: %p\n", s->bat.status, s->bat.pbw_blk, s->bat.pbw_offset, s->bat.req.tx); /* for (i = 0; i < s->hdr.max_bat_size; i++) DPRINTF("%d: %u\n", i, s->bat.bat[i]); */ } struct tap_disk tapdisk_vhd = { .disk_type = "tapdisk_vhd", .flags = 0, .private_data_size = sizeof(struct vhd_state), .td_open = _vhd_open, .td_close = _vhd_close, .td_queue_read = vhd_queue_read, .td_queue_write = vhd_queue_write, .td_get_parent_id = vhd_get_parent_id, .td_validate_parent = vhd_validate_parent, .td_debug = vhd_debug, }; blktap-2.0.90/drivers/tapdisk-disktype.c0000644000000000000000000001351711664745551016715 0ustar rootroot/* * Copyright (c) 2007, 2010, XenSource Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include "tapdisk-disktype.h" static const disk_info_t aio_disk = { "aio", "raw image (aio)", 0, }; static const disk_info_t sync_disk = { "sync", "raw image (sync)", 0, }; static const disk_info_t vmdk_disk = { "vmdk", "vmware image (vmdk)", 1, }; static const disk_info_t vhdsync_disk = { "vhdsync", "virtual server image (vhd) - synchronous", 1, }; static const disk_info_t vhd_disk = { "vhd", "virtual server image (vhd)", 0, }; static const disk_info_t ram_disk = { "ram", "ramdisk image (ram)", 1, }; static const disk_info_t qcow_disk = { "qcow", "qcow disk (qcow)", 0, }; static const disk_info_t block_cache_disk = { "bc", "block cache image (bc)", 1, }; static const disk_info_t vhd_index_disk = { "vhdi", "vhd index image (vhdi)", 1, }; static const disk_info_t log_disk = { "log", "write logger (log)", DISK_TYPE_FILTER, }; static disk_info_t remus_disk = { "remus disk replicator (remus)", "remus", 0, }; static const disk_info_t lcache_disk = { "lc", "local parent cache (lc)", DISK_TYPE_FILTER, }; static const disk_info_t llpcache_disk = { "llp", "local leaf cache, persistent (llp)", 0, }; static const disk_info_t llecache_disk = { "lle", "local leaf cache, ephemeral (lle)", 0, }; static const disk_info_t valve_disk = { "valve", "group rate limiting (valve)", DISK_TYPE_FILTER, }; const disk_info_t *tapdisk_disk_types[] = { [DISK_TYPE_AIO] = &aio_disk, [DISK_TYPE_SYNC] = &sync_disk, [DISK_TYPE_VMDK] = &vmdk_disk, [DISK_TYPE_VHDSYNC] = &vhdsync_disk, [DISK_TYPE_VHD] = &vhd_disk, [DISK_TYPE_RAM] = &ram_disk, [DISK_TYPE_QCOW] = &qcow_disk, [DISK_TYPE_BLOCK_CACHE] = &block_cache_disk, [DISK_TYPE_VINDEX] = &vhd_index_disk, [DISK_TYPE_LOG] = &log_disk, [DISK_TYPE_REMUS] = &remus_disk, [DISK_TYPE_LCACHE] = &lcache_disk, [DISK_TYPE_VALVE] = &valve_disk, [DISK_TYPE_LLPCACHE] = &llpcache_disk, [DISK_TYPE_LLECACHE] = &llecache_disk, 0, }; extern struct tap_disk tapdisk_aio; #if 0 extern struct 
tap_disk tapdisk_sync; extern struct tap_disk tapdisk_vmdk; extern struct tap_disk tapdisk_vhdsync; #endif extern struct tap_disk tapdisk_vhd; extern struct tap_disk tapdisk_ram; #if 0 extern struct tap_disk tapdisk_qcow; #endif extern struct tap_disk tapdisk_block_cache; extern struct tap_disk tapdisk_vhd_index; #if 0 extern struct tap_disk tapdisk_log; #endif extern struct tap_disk tapdisk_lcache; extern struct tap_disk tapdisk_llpcache; extern struct tap_disk tapdisk_llecache; extern struct tap_disk tapdisk_valve; const struct tap_disk *tapdisk_disk_drivers[] = { [DISK_TYPE_AIO] = &tapdisk_aio, #if 0 [DISK_TYPE_SYNC] = &tapdisk_sync, [DISK_TYPE_VMDK] = &tapdisk_vmdk, [DISK_TYPE_VHDSYNC] = &tapdisk_vhdsync_disk #endif [DISK_TYPE_VHD] = &tapdisk_vhd, [DISK_TYPE_RAM] = &tapdisk_ram, #if 0 [DISK_TYPE_QCOW] = &tapdisk_qcow, #endif [DISK_TYPE_BLOCK_CACHE] = &tapdisk_block_cache, [DISK_TYPE_VINDEX] = &tapdisk_vhd_index, #if 0 [DISK_TYPE_LOG] = &tapdisk_log, #endif [DISK_TYPE_LCACHE] = &tapdisk_lcache, [DISK_TYPE_LLPCACHE] = &tapdisk_llpcache, [DISK_TYPE_LLECACHE] = &tapdisk_llecache, [DISK_TYPE_VALVE] = &tapdisk_valve, 0, }; #define ARRAY_SIZE(_a) (sizeof(_a)/sizeof((_a)[0])) int tapdisk_disktype_find(const char *name) { int i; for (i = 0; i < ARRAY_SIZE(tapdisk_disk_types); i++) { const disk_info_t *info = tapdisk_disk_types[i]; if (!info) continue; if (strcmp(name, info->name)) continue; if (!tapdisk_disk_drivers[i]) return -ENOSYS; return i; } return -ENOENT; } int tapdisk_disktype_parse_params(const char *params, const char **_path) { char name[DISK_TYPE_NAME_MAX], *ptr; size_t len; int type; ptr = strchr(params, ':'); if (!ptr) return -EINVAL; len = ptr - params; if (len > sizeof(name) - 1) return -ENAMETOOLONG; memset(name, 0, sizeof(name)); strncpy(name, params, len); type = tapdisk_disktype_find(name); *_path = params + len + 1; return type; } blktap-2.0.90/drivers/block-lcache.c0000644000000000000000000002117611664745551015733 0ustar rootroot/* * Copyright (c) 
2010, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Local persistent cache: write any sectors not found in the leaf back to the * leaf. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include "vhd.h" #include "tapdisk.h" #include "tapdisk-utils.h" #include "tapdisk-driver.h" #include "tapdisk-server.h" #include "tapdisk-interface.h" #define DEBUG 1 #ifdef DEBUG #define DBG(_f, _a...) tlog_write(TLOG_DBG, _f, ##_a) #else #define DBG(_f, _a...) 
((void)0) #endif #define WARN(_f, _a...) tlog_syslog(TLOG_WARN, "WARNING: "_f "in %s:%d", \ ##_a, __func__, __LINE__) #define INFO(_f, _a...) tlog_syslog(TLOG_INFO, _f, ##_a) #define BUG() td_panic() #define BUG_ON(_cond) if (unlikely(_cond)) { td_panic(); } #define WARN_ON(_p) if (unlikely(_cond)) { WARN(_cond); } #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define TD_LCACHE_MAX_REQ (MAX_REQUESTS*2) #define TD_LCACHE_BUFSZ (MAX_SEGMENTS_PER_REQ * \ sysconf(_SC_PAGE_SIZE)) typedef struct lcache td_lcache_t; typedef struct lcache_request td_lcache_req_t; struct lcache_request { char *buf; int err; td_request_t treq; int secs; td_vbd_request_t vreq; struct td_iovec iov; td_lcache_t *cache; }; struct lcache { char *name; td_lcache_req_t reqv[TD_LCACHE_MAX_REQ]; td_lcache_req_t *free[TD_LCACHE_MAX_REQ]; int n_free; char *buf; size_t bufsz; int wr_en; struct timeval ts; }; static td_lcache_req_t * lcache_alloc_request(td_lcache_t *cache) { td_lcache_req_t *req = NULL; if (likely(cache->n_free)) req = cache->free[--cache->n_free]; return req; } static void lcache_free_request(td_lcache_t *cache, td_lcache_req_t *req) { BUG_ON(cache->n_free >= TD_LCACHE_MAX_REQ); cache->free[cache->n_free++] = req; } static void lcache_destroy_buffers(td_lcache_t *cache) { td_lcache_req_t *req; do { req = lcache_alloc_request(cache); if (req) munmap(req->buf, TD_LCACHE_BUFSZ); } while (req); } static int lcache_create_buffers(td_lcache_t *cache) { int prot, flags, i, err; prot = PROT_READ|PROT_WRITE; flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_LOCKED; cache->n_free = 0; for (i = 0; i < TD_LCACHE_MAX_REQ; i++) { td_lcache_req_t *req = &cache->reqv[i]; req->buf = mmap(NULL, TD_LCACHE_BUFSZ, prot, flags, -1, 0); if (req->buf == MAP_FAILED) { req->buf = NULL; err = -errno; goto fail; } lcache_free_request(cache, req); } return 0; fail: EPRINTF("Buffer init failure: %d", err); lcache_destroy_buffers(cache); return err; } static int lcache_close(td_driver_t *driver) { td_lcache_t *cache = driver->data; 
lcache_destroy_buffers(cache); free(cache->name); return 0; } static int lcache_open(td_driver_t *driver, const char *name, td_flag_t flags) { td_lcache_t *cache = driver->data; int err; err = tapdisk_namedup(&cache->name, (char *)name); if (err) goto fail; err = lcache_create_buffers(cache); if (err) goto fail; timerclear(&cache->ts); cache->wr_en = 1; return 0; fail: lcache_close(driver); return err; } /* * NB. lcache->{wr_en,ts}: test free space in the caching SR before * attempting to store our reads. VHD block allocation writes on Ext3 * have the nasty property of blocking excessively after running out * of space. We therefore enable/disable ourselves at a 1/s * granularity, querying free space through statfs beforehand. */ static long lcache_fs_bfree(const td_lcache_t *cache, long *bsize) { struct statfs fst; int err; err = statfs(cache->name, &fst); if (err) return err; if (likely(bsize)) *bsize = fst.f_bsize; return MIN(fst.f_bfree, LONG_MAX); } static int __lcache_wr_enabled(const td_lcache_t *cache) { long threshold = 2<<20; /* B */ long bfree, bsz = 1; int enable; bfree = lcache_fs_bfree(cache, &bsz); enable = bfree > threshold / bsz; return enable; } static int lcache_wr_enabled(td_lcache_t *cache) { const int timeout = 1; /* s */ struct timeval now, delta; gettimeofday(&now, NULL); timersub(&now, &cache->ts, &delta); if (delta.tv_sec >= timeout) { cache->wr_en = __lcache_wr_enabled(cache); cache->ts = now; } return cache->wr_en; } static void __lcache_write_cb(td_vbd_request_t *vreq, int error, void *token, int final) { td_lcache_req_t *req = containerof(vreq, td_lcache_req_t, vreq); td_lcache_t *cache = token; if (error == -ENOSPC) cache->wr_en = 0; lcache_free_request(cache, req); } static void lcache_store_read(td_lcache_t *cache, td_lcache_req_t *req) { td_vbd_request_t *vreq; struct td_iovec *iov; td_vbd_t *vbd; int err; iov = &req->iov; iov->base = req->buf; iov->secs = req->treq.secs; vreq = &req->vreq; vreq->op = TD_OP_WRITE; vreq->sec = 
req->treq.sec; vreq->iov = iov; vreq->iovcnt = 1; vreq->cb = __lcache_write_cb; vreq->token = cache; vbd = req->treq.vreq->vbd; err = tapdisk_vbd_queue_request(vbd, vreq); BUG_ON(err); } static void lcache_complete_read(td_lcache_t *cache, td_lcache_req_t *req) { if (likely(!req->err)) { size_t sz = req->treq.secs << SECTOR_SHIFT; memcpy(req->treq.buf, req->buf, sz); } td_complete_request(req->treq, req->err); if (unlikely(req->err) || !lcache_wr_enabled(cache)) { lcache_free_request(cache, req); return; } lcache_store_read(cache, req); } static void __lcache_read_cb(td_request_t treq, int err) { td_lcache_req_t *req = treq.cb_data; td_lcache_t *cache = req->cache; BUG_ON(req->secs < treq.secs); req->secs -= treq.secs; req->err = req->err ? : err; if (!req->secs) lcache_complete_read(cache, req); } static void lcache_queue_read(td_driver_t *driver, td_request_t treq) { td_lcache_t *cache = driver->data; td_request_t clone; td_lcache_req_t *req; req = lcache_alloc_request(cache); if (!req) { td_complete_request(treq, -EBUSY); return; } req->treq = treq; req->cache = cache; req->secs = req->treq.secs; req->err = 0; clone = treq; clone.buf = req->buf; clone.cb = __lcache_read_cb; clone.cb_data = req; td_forward_request(clone); } static int lcache_get_parent_id(td_driver_t *driver, td_disk_id_t *id) { return -EINVAL; } static int lcache_validate_parent(td_driver_t *driver, td_driver_t *pdriver, td_flag_t flags) { if (strcmp(driver->name, pdriver->name)) return -EINVAL; return 0; } struct tap_disk tapdisk_lcache = { .disk_type = "tapdisk_lcache", .flags = 0, .private_data_size = sizeof(td_lcache_t), .td_open = lcache_open, .td_close = lcache_close, .td_queue_read = lcache_queue_read, .td_get_parent_id = lcache_get_parent_id, .td_validate_parent = lcache_validate_parent, }; blktap-2.0.90/drivers/tapdisk-control.h0000644000000000000000000000331111664745551016535 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __TAPDISK_CONTROL_H__ #define __TAPDISK_CONTROL_H__ int tapdisk_control_open(char **path); void tapdisk_control_close(void); #endif blktap-2.0.90/drivers/tapdisk-server.h0000644000000000000000000000523011664745551016365 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef _TAPDISK_SERVER_H_ #define _TAPDISK_SERVER_H_ #include "list.h" #include "tapdisk-vbd.h" #include "tapdisk-queue.h" struct tap_disk *tapdisk_server_find_driver_interface(int); td_image_t *tapdisk_server_get_shared_image(td_image_t *); struct list_head *tapdisk_server_get_all_vbds(void); td_vbd_t *tapdisk_server_get_vbd(td_uuid_t); void tapdisk_server_add_vbd(td_vbd_t *); void tapdisk_server_remove_vbd(td_vbd_t *); void tapdisk_server_queue_tiocb(struct tiocb *); void tapdisk_server_check_state(void); event_id_t tapdisk_server_register_event(char, int, int, event_cb_t, void *); void tapdisk_server_unregister_event(event_id_t); void tapdisk_server_mask_event(event_id_t, int); void tapdisk_server_set_max_timeout(int); int tapdisk_server_init(void); int tapdisk_server_initialize(const char *, const char *); int tapdisk_server_complete(void); int tapdisk_server_run(void); void tapdisk_server_iterate(void); int tapdisk_server_openlog(const char *, int, int); void tapdisk_server_closelog(void); void tapdisk_start_logging(const char *, const char *); void tapdisk_stop_logging(void); #endif blktap-2.0.90/drivers/tapdisk-vbd.h0000644000000000000000000001421311664745551015633 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _TAPDISK_VBD_H_ #define _TAPDISK_VBD_H_ #include #include "tapdisk.h" #include "scheduler.h" #include "tapdisk-image.h" #include "tapdisk-blktap.h" #define TD_VBD_REQUEST_TIMEOUT 120 #define TD_VBD_MAX_RETRIES 100 #define TD_VBD_RETRY_INTERVAL 1 #define TD_VBD_DEAD 0x0001 #define TD_VBD_CLOSED 0x0002 #define TD_VBD_QUIESCE_REQUESTED 0x0004 #define TD_VBD_QUIESCED 0x0008 #define TD_VBD_PAUSE_REQUESTED 0x0010 #define TD_VBD_PAUSED 0x0020 #define TD_VBD_SHUTDOWN_REQUESTED 0x0040 #define TD_VBD_LOCKING 0x0080 #define TD_VBD_LOG_DROPPED 0x0100 #define TD_VBD_SECONDARY_DISABLED 0 #define TD_VBD_SECONDARY_MIRROR 1 #define TD_VBD_SECONDARY_STANDBY 2 struct td_vbd_handle { char *name; td_blktap_t *tap; td_uuid_t uuid; td_flag_t flags; td_flag_t state; struct list_head images; int parent_devnum; char *secondary_name; td_image_t *secondary; uint8_t secondary_mode; int FIXME_enospc_redirect_count_enabled; uint64_t FIXME_enospc_redirect_count; /* when we encounter ENOSPC on the primary leaf image in mirror mode, * we need to remove it from the VBD chain so that writes start going * on the secondary leaf. 
However, we cannot free the image at that * time since it might still have in-flight treqs referencing it. * Therefore, we move it into 'retired' until shutdown. */ td_image_t *retired; struct list_head new_requests; struct list_head pending_requests; struct list_head failed_requests; struct list_head completed_requests; td_vbd_request_t request_list[MAX_REQUESTS]; /* XXX */ struct list_head next; struct timeval ts; uint64_t received; uint64_t returned; uint64_t kicked; uint64_t secs_pending; uint64_t retries; uint64_t errors; td_sector_count_t secs; }; #define tapdisk_vbd_for_each_request(vreq, tmp, list) \ list_for_each_entry_safe((vreq), (tmp), (list), next) #define tapdisk_vbd_for_each_image(vbd, image, tmp) \ tapdisk_for_each_image_safe(image, tmp, &vbd->images) static inline void tapdisk_vbd_move_request(td_vbd_request_t *vreq, struct list_head *dest) { list_del(&vreq->next); INIT_LIST_HEAD(&vreq->next); list_add_tail(&vreq->next, dest); vreq->list_head = dest; } static inline void tapdisk_vbd_add_image(td_vbd_t *vbd, td_image_t *image) { list_add_tail(&image->next, &vbd->images); } static inline int tapdisk_vbd_is_last_image(td_vbd_t *vbd, td_image_t *image) { return list_is_last(&image->next, &vbd->images); } static inline td_image_t * tapdisk_vbd_first_image(td_vbd_t *vbd) { td_image_t *image = NULL; if (!list_empty(&vbd->images)) image = list_entry(vbd->images.next, td_image_t, next); return image; } static inline td_image_t * tapdisk_vbd_last_image(td_vbd_t *vbd) { td_image_t *image = NULL; if (!list_empty(&vbd->images)) image = list_entry(vbd->images.prev, td_image_t, next); return image; } static inline td_image_t * tapdisk_vbd_next_image(td_image_t *image) { return list_entry(image->next.next, td_image_t, next); } td_vbd_t *tapdisk_vbd_create(td_uuid_t); int tapdisk_vbd_initialize(int, int, td_uuid_t); int tapdisk_vbd_open(td_vbd_t *, const char *, int, const char *, td_flag_t); int tapdisk_vbd_close(td_vbd_t *); int tapdisk_vbd_open_vdi(td_vbd_t *, 
const char *, td_flag_t, int); void tapdisk_vbd_close_vdi(td_vbd_t *); int tapdisk_vbd_attach(td_vbd_t *, const char *, int); void tapdisk_vbd_detach(td_vbd_t *); int tapdisk_vbd_queue_request(td_vbd_t *, td_vbd_request_t *); void tapdisk_vbd_forward_request(td_request_t); int tapdisk_vbd_get_disk_info(td_vbd_t *, td_disk_info_t *); int tapdisk_vbd_retry_needed(td_vbd_t *); int tapdisk_vbd_quiesce_queue(td_vbd_t *); int tapdisk_vbd_start_queue(td_vbd_t *); int tapdisk_vbd_issue_requests(td_vbd_t *); int tapdisk_vbd_kill_queue(td_vbd_t *); int tapdisk_vbd_pause(td_vbd_t *); int tapdisk_vbd_resume(td_vbd_t *, const char *); void tapdisk_vbd_kick(td_vbd_t *); void tapdisk_vbd_check_state(td_vbd_t *); int tapdisk_vbd_recheck_state(td_vbd_t *); void tapdisk_vbd_check_progress(td_vbd_t *); void tapdisk_vbd_debug(td_vbd_t *); void tapdisk_vbd_stats(td_vbd_t *, td_stats_t *); #endif blktap-2.0.90/drivers/tapdisk-queue.c0000644000000000000000000003617011664745551016205 0ustar rootroot/* * Copyright (c) 2007, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #ifdef __linux__ #include #endif #include "tapdisk.h" #include "tapdisk-log.h" #include "tapdisk-queue.h" #include "tapdisk-filter.h" #include "tapdisk-server.h" #include "tapdisk-utils.h" #include "libaio-compat.h" #include "atomicio.h" #define WARN(_f, _a...) tlog_write(TLOG_WARN, _f, ##_a) #define DBG(_f, _a...) tlog_write(TLOG_DBG, _f, ##_a) #define ERR(_err, _f, _a...) tlog_error(_err, _f, ##_a) /* * We used a kernel patch to return an fd associated with the AIO context * so that we can concurrently poll on synchronous and async descriptors. * This is signalled by passing 1 as the io context to io_setup. 
*/ #define REQUEST_ASYNC_FD ((io_context_t)1) static inline void queue_tiocb(struct tqueue *queue, struct tiocb *tiocb) { struct iocb *iocb = &tiocb->iocb; if (queue->queued) { struct tiocb *prev = (struct tiocb *) queue->iocbs[queue->queued - 1]->data; prev->next = tiocb; } queue->iocbs[queue->queued++] = iocb; } static inline int deferred_tiocbs(struct tqueue *queue) { return (queue->deferred.head != NULL); } static inline void defer_tiocb(struct tqueue *queue, struct tiocb *tiocb) { struct tlist *list = &queue->deferred; if (!list->head) list->head = list->tail = tiocb; else list->tail = list->tail->next = tiocb; queue->tiocbs_deferred++; queue->deferrals++; } static inline void queue_deferred_tiocb(struct tqueue *queue) { struct tlist *list = &queue->deferred; if (list->head) { struct tiocb *tiocb = list->head; list->head = tiocb->next; if (!list->head) list->tail = NULL; queue_tiocb(queue, tiocb); queue->tiocbs_deferred--; } } static inline void queue_deferred_tiocbs(struct tqueue *queue) { while (!tapdisk_queue_full(queue) && deferred_tiocbs(queue)) queue_deferred_tiocb(queue); } /* * td_complete may queue more tiocbs */ static void complete_tiocb(struct tqueue *queue, struct tiocb *tiocb, unsigned long res) { int err; struct iocb *iocb = &tiocb->iocb; if (res == iocb->u.c.nbytes) err = 0; else if ((int)res < 0) err = (int)res; else err = -EIO; tiocb->cb(tiocb->arg, tiocb, err); } static int cancel_tiocbs(struct tqueue *queue, int err) { int queued; struct tiocb *tiocb; if (!queue->queued) return 0; /* * td_complete may queue more tiocbs, which * will overwrite the contents of queue->iocbs. * use a private linked list to keep track * of the tiocbs we're cancelling. 
*/ tiocb = queue->iocbs[0]->data; queued = queue->queued; queue->queued = 0; for (; tiocb != NULL; tiocb = tiocb->next) complete_tiocb(queue, tiocb, err); return queued; } static int fail_tiocbs(struct tqueue *queue, int succeeded, int total, int err) { ERR(err, "io_submit error: %d of %d failed", total - succeeded, total); /* take any non-submitted, merged iocbs * off of the queue, split them, and fail them */ queue->queued = io_expand_iocbs(&queue->opioctx, queue->iocbs, succeeded, total); return cancel_tiocbs(queue, err); } /* * rwio */ struct rwio { struct io_event *aio_events; }; static void tapdisk_rwio_destroy(struct tqueue *queue) { struct rwio *rwio = queue->tio_data; if (rwio->aio_events) { free(rwio->aio_events); rwio->aio_events = NULL; } } static int tapdisk_rwio_setup(struct tqueue *queue, int size) { struct rwio *rwio = queue->tio_data; rwio->aio_events = calloc(size, sizeof(struct io_event)); if (!rwio->aio_events) return -errno; return 0; } static inline ssize_t tapdisk_rwio_rw(const struct iocb *iocb) { int fd = iocb->aio_fildes; char *buf = iocb->u.c.buf; long long off = iocb->u.c.offset; size_t size = iocb->u.c.nbytes; ssize_t (*func)(int, void *, size_t) = (iocb->aio_lio_opcode == IO_CMD_PWRITE ? 
vwrite : read); if (lseek64(fd, off, SEEK_SET) == (off64_t)-1) return -errno; if (atomicio(func, fd, buf, size) != size) return -errno; return size; } static int tapdisk_rwio_submit(struct tqueue *queue) { struct rwio *rwio = queue->tio_data; int i, merged, split; struct iocb *iocb; struct tiocb *tiocb; struct io_event *ep; if (!queue->queued) return 0; tapdisk_filter_iocbs(queue->filter, queue->iocbs, queue->queued); merged = io_merge(&queue->opioctx, queue->iocbs, queue->queued); queue->queued = 0; for (i = 0; i < merged; i++) { ep = rwio->aio_events + i; iocb = queue->iocbs[i]; ep->obj = iocb; ep->res = tapdisk_rwio_rw(iocb); } split = io_split(&queue->opioctx, rwio->aio_events, merged); tapdisk_filter_events(queue->filter, rwio->aio_events, split); for (i = split, ep = rwio->aio_events; i-- > 0; ep++) { iocb = ep->obj; tiocb = iocb->data; complete_tiocb(queue, tiocb, ep->res); } queue_deferred_tiocbs(queue); return split; } static const struct tio td_tio_rwio = { .name = "rwio", .data_size = 0, .tio_setup = tapdisk_rwio_setup, .tio_destroy = tapdisk_rwio_destroy, .tio_submit = tapdisk_rwio_submit }; /* * libaio */ struct lio { io_context_t aio_ctx; struct io_event *aio_events; int event_fd; int event_id; int flags; }; #define LIO_FLAG_EVENTFD (1<<0) static int tapdisk_lio_check_resfd(void) { return tapdisk_linux_version() >= KERNEL_VERSION(2, 6, 22); } static void tapdisk_lio_destroy_aio(struct tqueue *queue) { struct lio *lio = queue->tio_data; if (lio->event_fd >= 0) { close(lio->event_fd); lio->event_fd = -1; } if (lio->aio_ctx) { io_destroy(lio->aio_ctx); lio->aio_ctx = 0; } } static int __lio_setup_aio_poll(struct tqueue *queue, int qlen) { struct lio *lio = queue->tio_data; int err, fd; lio->aio_ctx = REQUEST_ASYNC_FD; fd = io_setup(qlen, &lio->aio_ctx); if (fd < 0) { lio->aio_ctx = 0; err = -errno; if (err == -EINVAL) goto fail_fd; goto fail; } lio->event_fd = fd; return 0; fail_fd: DPRINTF("Couldn't get fd for AIO poll support. 
This is probably " "because your kernel does not have the aio-poll patch " "applied.\n"); fail: return err; } static int __lio_setup_aio_eventfd(struct tqueue *queue, int qlen) { struct lio *lio = queue->tio_data; int err; err = io_setup(qlen, &lio->aio_ctx); if (err < 0) { lio->aio_ctx = 0; return err; } lio->event_fd = tapdisk_sys_eventfd(0); if (lio->event_fd < 0) return -errno; lio->flags |= LIO_FLAG_EVENTFD; return 0; } static int tapdisk_lio_setup_aio(struct tqueue *queue, int qlen) { struct lio *lio = queue->tio_data; int err; lio->aio_ctx = 0; lio->event_fd = -1; /* * prefer the mainline eventfd(2) api, if available. * if not, fall back to the poll fd patch. */ err = !tapdisk_lio_check_resfd(); if (!err) err = __lio_setup_aio_eventfd(queue, qlen); if (err) err = __lio_setup_aio_poll(queue, qlen); if (err == -EAGAIN) goto fail_rsv; fail: return err; fail_rsv: DPRINTF("Couldn't setup AIO context. If you are trying to " "concurrently use a large number of blktap-based disks, you may " "need to increase the system-wide aio request limit. " "(e.g. 
'echo 1048576 > /proc/sys/fs/aio-max-nr')\n"); goto fail; } static void tapdisk_lio_destroy(struct tqueue *queue) { struct lio *lio = queue->tio_data; if (!lio) return; if (lio->event_id >= 0) { tapdisk_server_unregister_event(lio->event_id); lio->event_id = -1; } tapdisk_lio_destroy_aio(queue); if (lio->aio_events) { free(lio->aio_events); lio->aio_events = NULL; } } static void tapdisk_lio_set_eventfd(struct tqueue *queue, int n, struct iocb **iocbs) { struct lio *lio = queue->tio_data; int i; if (lio->flags & LIO_FLAG_EVENTFD) for (i = 0; i < n; ++i) __io_set_eventfd(iocbs[i], lio->event_fd); } static void tapdisk_lio_ack_event(struct tqueue *queue) { struct lio *lio = queue->tio_data; uint64_t val; if (lio->flags & LIO_FLAG_EVENTFD) { int gcc = read(lio->event_fd, &val, sizeof(val)); if (gcc) {}; } } static void tapdisk_lio_event(event_id_t id, char mode, void *private) { struct tqueue *queue = private; struct lio *lio; int i, ret, split; struct iocb *iocb; struct tiocb *tiocb; struct io_event *ep; tapdisk_lio_ack_event(queue); lio = queue->tio_data; ret = io_getevents(lio->aio_ctx, 0, queue->size, lio->aio_events, NULL); split = io_split(&queue->opioctx, lio->aio_events, ret); tapdisk_filter_events(queue->filter, lio->aio_events, split); DBG("events: %d, tiocbs: %d\n", ret, split); queue->iocbs_pending -= ret; queue->tiocbs_pending -= split; for (i = split, ep = lio->aio_events; i-- > 0; ep++) { iocb = ep->obj; tiocb = iocb->data; complete_tiocb(queue, tiocb, ep->res); } queue_deferred_tiocbs(queue); } static int tapdisk_lio_setup(struct tqueue *queue, int qlen) { struct lio *lio = queue->tio_data; int err; lio->event_id = -1; err = tapdisk_lio_setup_aio(queue, qlen); if (err) goto fail; lio->event_id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, lio->event_fd, 0, tapdisk_lio_event, queue); err = lio->event_id; if (err < 0) goto fail; lio->aio_events = calloc(qlen, sizeof(struct io_event)); if (!lio->aio_events) { err = -errno; goto fail; } return 0; 
fail: tapdisk_lio_destroy(queue); return err; } static int tapdisk_lio_submit(struct tqueue *queue) { struct lio *lio = queue->tio_data; int merged, submitted, err = 0; if (!queue->queued) return 0; tapdisk_filter_iocbs(queue->filter, queue->iocbs, queue->queued); merged = io_merge(&queue->opioctx, queue->iocbs, queue->queued); tapdisk_lio_set_eventfd(queue, merged, queue->iocbs); submitted = io_submit(lio->aio_ctx, merged, queue->iocbs); DBG("queued: %d, merged: %d, submitted: %d\n", queue->queued, merged, submitted); if (submitted < 0) { err = submitted; submitted = 0; } else if (submitted < merged) err = -EIO; queue->iocbs_pending += submitted; queue->tiocbs_pending += queue->queued; queue->queued = 0; if (err) queue->tiocbs_pending -= fail_tiocbs(queue, submitted, merged, err); return submitted; } static const struct tio td_tio_lio = { .name = "lio", .data_size = sizeof(struct lio), .tio_setup = tapdisk_lio_setup, .tio_destroy = tapdisk_lio_destroy, .tio_submit = tapdisk_lio_submit, }; static void tapdisk_queue_free_io(struct tqueue *queue) { if (queue->tio) { if (queue->tio->tio_destroy) queue->tio->tio_destroy(queue); queue->tio = NULL; } if (queue->tio_data) { free(queue->tio_data); queue->tio_data = NULL; } } static int tapdisk_queue_init_io(struct tqueue *queue, int drv) { const struct tio *tio; int err; switch (drv) { case TIO_DRV_LIO: tio = &td_tio_lio; break; case TIO_DRV_RWIO: tio = &td_tio_rwio; break; default: err = -EINVAL; goto fail; } queue->tio_data = calloc(1, tio->data_size); if (!queue->tio_data) { PERROR("malloc(%zu)", tio->data_size); err = -errno; goto fail; } queue->tio = tio; if (tio->tio_setup) { err = tio->tio_setup(queue, queue->size); if (err) goto fail; } DPRINTF("I/O queue driver: %s\n", tio->name); return 0; fail: tapdisk_queue_free_io(queue); return err; } int tapdisk_init_queue(struct tqueue *queue, int size, int drv, struct tfilter *filter) { int err; memset(queue, 0, sizeof(struct tqueue)); queue->size = size; queue->filter = 
filter; if (!size) return 0; err = tapdisk_queue_init_io(queue, drv); if (err) goto fail; queue->iocbs = calloc(size, sizeof(struct iocb *)); if (!queue->iocbs) { err = -errno; goto fail; } err = opio_init(&queue->opioctx, size); if (err) goto fail; return 0; fail: tapdisk_free_queue(queue); return err; } void tapdisk_free_queue(struct tqueue *queue) { tapdisk_queue_free_io(queue); free(queue->iocbs); queue->iocbs = NULL; opio_free(&queue->opioctx); } void tapdisk_debug_queue(struct tqueue *queue) { struct tiocb *tiocb = queue->deferred.head; WARN("TAPDISK QUEUE:\n"); WARN("size: %d, tio: %s, queued: %d, iocbs_pending: %d, " "tiocbs_pending: %d, tiocbs_deferred: %d, deferrals: %"PRIx64"\n", queue->size, queue->tio->name, queue->queued, queue->iocbs_pending, queue->tiocbs_pending, queue->tiocbs_deferred, queue->deferrals); if (tiocb) { WARN("deferred:\n"); for (; tiocb != NULL; tiocb = tiocb->next) { struct iocb *io = &tiocb->iocb; WARN("%s of %lu bytes at %lld\n", (io->aio_lio_opcode == IO_CMD_PWRITE ? 
"write" : "read"), io->u.c.nbytes, io->u.c.offset); } } } void tapdisk_prep_tiocb(struct tiocb *tiocb, int fd, int rw, char *buf, size_t size, long long offset, td_queue_callback_t cb, void *arg) { struct iocb *iocb = &tiocb->iocb; if (rw) io_prep_pwrite(iocb, fd, buf, size, offset); else io_prep_pread(iocb, fd, buf, size, offset); iocb->data = tiocb; tiocb->cb = cb; tiocb->arg = arg; tiocb->next = NULL; } void tapdisk_queue_tiocb(struct tqueue *queue, struct tiocb *tiocb) { if (!tapdisk_queue_full(queue)) queue_tiocb(queue, tiocb); else defer_tiocb(queue, tiocb); } /* * fail_tiocbs may queue more tiocbs */ int tapdisk_submit_tiocbs(struct tqueue *queue) { return queue->tio->tio_submit(queue); } int tapdisk_submit_all_tiocbs(struct tqueue *queue) { int submitted = 0; do { submitted += tapdisk_submit_tiocbs(queue); } while (!tapdisk_queue_empty(queue)); return submitted; } /* * cancel_tiocbs may queue more tiocbs */ int tapdisk_cancel_tiocbs(struct tqueue *queue) { return cancel_tiocbs(queue, -EIO); } int tapdisk_cancel_all_tiocbs(struct tqueue *queue) { int cancelled = 0; do { cancelled += tapdisk_cancel_tiocbs(queue); } while (!tapdisk_queue_empty(queue)); return cancelled; } blktap-2.0.90/drivers/tapdisk-server.c0000644000000000000000000002064411664745551016366 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. 
nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include "tapdisk-syslog.h" #include "tapdisk-server.h" #include "tapdisk-driver.h" #include "tapdisk-interface.h" #include "tapdisk-log.h" #define DBG(_level, _f, _a...) tlog_write(_level, _f, ##_a) #define ERR(_err, _f, _a...) 
tlog_error(_err, _f, ##_a) #define TAPDISK_TIOCBS (TAPDISK_DATA_REQUESTS + 50) typedef struct tapdisk_server { int run; struct list_head vbds; scheduler_t scheduler; struct tqueue aio_queue; char *name; char *ident; int facility; } tapdisk_server_t; static tapdisk_server_t server; #define tapdisk_server_for_each_vbd(vbd, tmp) \ list_for_each_entry_safe(vbd, tmp, &server.vbds, next) td_image_t * tapdisk_server_get_shared_image(td_image_t *image) { td_vbd_t *vbd, *tmpv; td_image_t *img, *tmpi; if (!td_flag_test(image->flags, TD_OPEN_SHAREABLE)) return NULL; tapdisk_server_for_each_vbd(vbd, tmpv) tapdisk_vbd_for_each_image(vbd, img, tmpi) if (img->type == image->type && !strcmp(img->name, image->name)) return img; return NULL; } struct list_head * tapdisk_server_get_all_vbds(void) { return &server.vbds; } td_vbd_t * tapdisk_server_get_vbd(uint16_t uuid) { td_vbd_t *vbd, *tmp; tapdisk_server_for_each_vbd(vbd, tmp) if (vbd->uuid == uuid) return vbd; return NULL; } void tapdisk_server_add_vbd(td_vbd_t *vbd) { list_add_tail(&vbd->next, &server.vbds); } void tapdisk_server_remove_vbd(td_vbd_t *vbd) { list_del(&vbd->next); INIT_LIST_HEAD(&vbd->next); tapdisk_server_check_state(); } void tapdisk_server_queue_tiocb(struct tiocb *tiocb) { tapdisk_queue_tiocb(&server.aio_queue, tiocb); } void tapdisk_server_debug(void) { td_vbd_t *vbd, *tmp; tapdisk_debug_queue(&server.aio_queue); tapdisk_server_for_each_vbd(vbd, tmp) tapdisk_vbd_debug(vbd); DBG(TLOG_INFO, "debug log completed\n"); tlog_precious(); } void tapdisk_server_check_state(void) { if (list_empty(&server.vbds)) server.run = 0; } event_id_t tapdisk_server_register_event(char mode, int fd, int timeout, event_cb_t cb, void *data) { return scheduler_register_event(&server.scheduler, mode, fd, timeout, cb, data); } void tapdisk_server_unregister_event(event_id_t event) { return scheduler_unregister_event(&server.scheduler, event); } void tapdisk_server_mask_event(event_id_t event, int masked) { return 
scheduler_mask_event(&server.scheduler, event, masked); } void tapdisk_server_set_max_timeout(int seconds) { scheduler_set_max_timeout(&server.scheduler, seconds); } static void tapdisk_server_assert_locks(void) { } static void tapdisk_server_set_retry_timeout(void) { td_vbd_t *vbd, *tmp; tapdisk_server_for_each_vbd(vbd, tmp) if (tapdisk_vbd_retry_needed(vbd)) { tapdisk_server_set_max_timeout(TD_VBD_RETRY_INTERVAL); return; } } static void tapdisk_server_check_progress(void) { struct timeval now; td_vbd_t *vbd, *tmp; gettimeofday(&now, NULL); tapdisk_server_for_each_vbd(vbd, tmp) tapdisk_vbd_check_progress(vbd); } static void tapdisk_server_submit_tiocbs(void) { tapdisk_submit_all_tiocbs(&server.aio_queue); } static void tapdisk_server_kick_responses(void) { td_vbd_t *vbd, *tmp; tapdisk_server_for_each_vbd(vbd, tmp) tapdisk_vbd_kick(vbd); } static void tapdisk_server_check_vbds(void) { td_vbd_t *vbd, *tmp; tapdisk_server_for_each_vbd(vbd, tmp) tapdisk_vbd_check_state(vbd); } static int tapdisk_server_recheck_vbds(void) { td_vbd_t *vbd, *tmp; int rv = 0; tapdisk_server_for_each_vbd(vbd, tmp) rv += tapdisk_vbd_recheck_state(vbd); return rv; } static void tapdisk_server_stop_vbds(void) { td_vbd_t *vbd, *tmp; tapdisk_server_for_each_vbd(vbd, tmp) tapdisk_vbd_kill_queue(vbd); } static int tapdisk_server_init_aio(void) { return tapdisk_init_queue(&server.aio_queue, TAPDISK_TIOCBS, TIO_DRV_LIO, NULL); } static void tapdisk_server_close_aio(void) { tapdisk_free_queue(&server.aio_queue); } int tapdisk_server_openlog(const char *name, int options, int facility) { server.facility = facility; server.name = strdup(name); server.ident = tapdisk_syslog_ident(name); if (!server.name || !server.ident) return -errno; openlog(server.ident, options, facility); return 0; } void tapdisk_server_closelog(void) { closelog(); free(server.name); server.name = NULL; free(server.ident); server.ident = NULL; } static int tapdisk_server_open_tlog(void) { int err = 0; if (server.name) err = 
tlog_open(server.name, server.facility, TLOG_WARN); return err; } static void tapdisk_server_close_tlog(void) { tlog_close(); } static void tapdisk_server_close(void) { tapdisk_server_close_tlog(); tapdisk_server_close_aio(); } void tapdisk_server_iterate(void) { int ret; tapdisk_server_assert_locks(); tapdisk_server_set_retry_timeout(); tapdisk_server_check_progress(); ret = scheduler_wait_for_events(&server.scheduler); if (ret < 0) DBG(TLOG_WARN, "server wait returned %d\n", ret); tapdisk_server_check_vbds(); do { tapdisk_server_submit_tiocbs(); tapdisk_server_kick_responses(); ret = tapdisk_server_recheck_vbds(); } while (ret); } static void __tapdisk_server_run(void) { while (server.run) tapdisk_server_iterate(); } static void tapdisk_server_signal_handler(int signal) { td_vbd_t *vbd, *tmp; static int xfsz_error_sent = 0; switch (signal) { case SIGBUS: case SIGINT: tapdisk_server_for_each_vbd(vbd, tmp) tapdisk_vbd_close(vbd); break; case SIGXFSZ: ERR(EFBIG, "received SIGXFSZ"); tapdisk_server_stop_vbds(); if (xfsz_error_sent) break; xfsz_error_sent = 1; break; case SIGUSR1: DBG(TLOG_INFO, "debugging on signal %d\n", signal); tapdisk_server_debug(); break; } } int tapdisk_server_init(void) { memset(&server, 0, sizeof(server)); INIT_LIST_HEAD(&server.vbds); scheduler_initialize(&server.scheduler); return 0; } int tapdisk_server_complete(void) { int err; err = tapdisk_server_init_aio(); if (err) goto fail; err = tapdisk_server_open_tlog(); if (err) goto fail; server.run = 1; return 0; fail: tapdisk_server_close_tlog(); tapdisk_server_close_aio(); return err; } int tapdisk_server_initialize(const char *read, const char *write) { int err; tapdisk_server_init(); err = tapdisk_server_complete(); if (err) goto fail; return 0; fail: tapdisk_server_close(); return err; } int tapdisk_server_run() { int err; err = tapdisk_set_resource_limits(); if (err) return err; signal(SIGBUS, tapdisk_server_signal_handler); signal(SIGINT, tapdisk_server_signal_handler); signal(SIGUSR1, 
tapdisk_server_signal_handler); signal(SIGXFSZ, tapdisk_server_signal_handler); __tapdisk_server_run(); tapdisk_server_close(); return 0; } blktap-2.0.90/drivers/tapdisk-log.h0000644000000000000000000000521711664745551015645 0ustar rootroot/* * Copyright (c) 2009, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef _TAPDISK_LOG_H_ #define _TAPDISK_LOG_H_ #define TLOG_WARN 0 #define TLOG_INFO 1 #define TLOG_DBG 2 #define TLOG_DIR "/var/log/blktap" #include #include "compiler.h" int tlog_open(const char *, int, int); void tlog_close(void); void tlog_precious(void); void tlog_vsyslog(int, const char *, va_list); void tlog_syslog(int, const char *, ...) __printf(2, 3); #include #define EPRINTF(_f, _a...) syslog(LOG_ERR, "tap-err:%s: " _f, __func__, ##_a) #define DPRINTF(_f, _a...) syslog(LOG_INFO, _f, ##_a) #define PERROR(_f, _a...) EPRINTF(_f ": %s", ##_a, strerror(errno)) void __tlog_write(int, const char *, ...) __printf(2, 3); void __tlog_error(const char *fmt, ...) __printf(1, 2); #define tlog_write(_level, _f, _a...) \ __tlog_write(_level, "%s: " _f, __func__, ##_a) #define tlog_error(_err, _f, _a...) \ __tlog_error("ERROR: errno %d at %s: " _f, \ (int)_err, __func__, ##_a) #define tlog_drv_error(_drv, _err, _f, _a ...) do { \ if (tapdisk_driver_log_pass(_drv, __func__)) \ tlog_error(_err, _f, ##_a); \ } while (0) #endif blktap-2.0.90/drivers/atomicio.c0000644000000000000000000000371311664745551015225 0ustar rootroot/* * Copyright (c) 2005 Anil Madhavapeddy. All rights reserved. * Copyright (c) 1995,1999 Theo de Raadt. All rights reserved. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include "atomicio.h" /* * ensure all of data on socket comes through. f==read || f==vwrite */ size_t atomicio(f, fd, _s, n) ssize_t (*f) (int, void *, size_t); int fd; void *_s; size_t n; { char *s = _s; size_t pos = 0; ssize_t res; while (n > pos) { res = (f) (fd, s + pos, n - pos); switch (res) { case -1: if (errno == EINTR || errno == EAGAIN) continue; return 0; case 0: errno = EPIPE; return pos; default: pos += (size_t)res; } } return (pos); } blktap-2.0.90/drivers/libaio-compat.h0000644000000000000000000000515311664745551016146 0ustar rootroot/* * Copyright (c) 2010, XenSource Inc. * All rights reserved. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License * as published by the Free Software Foundation; either version 2 of * the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 * USA */ /* * kernel 2.6.21 added eventfd(2) support, kernel 2.6.22 eventfds for * aio. libaio 0.3.107 updated the header file, but few systems have * it. define a custom iocb_common struct instead, and work around a * potentially missing sys/eventfd.h. this header should vanish over * time. */ #ifndef __LIBAIO_COMPAT #define __LIBAIO_COMPAT #include #include #include struct __compat_io_iocb_common { char __pad_buf[8]; char __pad_nbytes[8]; long long offset; long long __pad3; unsigned flags; unsigned resfd; }; static inline void __io_set_eventfd(struct iocb *iocb, int eventfd) { struct __compat_io_iocb_common *c; c = (struct __compat_io_iocb_common*)&iocb->u.c; c->flags |= (1 << 0); c->resfd = eventfd; } #ifndef SYS_eventfd #ifndef __NR_eventfd # if defined(__alpha__) # define __NR_eventfd 478 # elif defined(__arm__) # define __NR_eventfd (__NR_SYSCALL_BASE+351) # elif defined(__ia64__) # define __NR_eventfd 1309 # elif defined(__i386__) # define __NR_eventfd 323 # elif defined(__m68k__) # define __NR_eventfd 319 # elif 0 && defined(__mips__) # error __NR_eventfd? # define __NR_eventfd (__NR_Linux + 319) # define __NR_eventfd (__NR_Linux + 278) # define __NR_eventfd (__NR_Linux + 282) # elif defined(__hppa__) # define __NR_eventfd (__NR_Linux + 304) # elif defined(__PPC__) || defined(__powerpc64__) # define __NR_eventfd 307 # elif defined(__s390__) || defined(__s390x__) # define __NR_eventfd 318 # elif defined(__sparc__) # define __NR_eventfd 313 # elif defined(__x86_64__) # define __NR_eventfd 284 # endif #else # error __NR_eventfd? 
#endif #define SYS_eventfd __NR_eventfd #endif static inline int tapdisk_sys_eventfd(int initval) { return syscall(SYS_eventfd, initval, 0); } #endif /* __LIBAIO_COMPAT */ blktap-2.0.90/drivers/tapdisk-blktap.c0000644000000000000000000003204611664745551016334 0ustar rootroot/* * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include #include #include #include "blktap.h" #include "tapdisk-vbd.h" #include "tapdisk-blktap.h" #include "tapdisk-server.h" #include "linux-blktap.h" #define BUG(_cond) td_panic() #define BUG_ON(_cond) if (unlikely(_cond)) { td_panic(); } #define DBG(_f, _a...) tlog_syslog(TLOG_DBG, _f, ##_a) #define INFO(_f, _a...) tlog_syslog(TLOG_INFO, _f, ##_a) #define ERR(_err, _f, _a...) tlog_error(_err, _f, ##_a) #define WARN(_f, _a...) tlog_syslog(TLOG_WARN, "WARNING: "_f "in %s:%d", \ ##_a, __func__, __LINE__) #define __RD2(_x) (((_x) & 0x00000002) ? 0x2 : ((_x) & 0x1)) #define __RD4(_x) (((_x) & 0x0000000c) ? __RD2((_x)>>2)<<2 : __RD2(_x)) #define __RD8(_x) (((_x) & 0x000000f0) ? __RD4((_x)>>4)<<4 : __RD4(_x)) #define __RD16(_x) (((_x) & 0x0000ff00) ? __RD8((_x)>>8)<<8 : __RD8(_x)) #define __RD32(_x) (((_x) & 0xffff0000) ? __RD16((_x)>>16)<<16 : __RD16(_x)) #define BLKTAP_RD32(_n) __RD32(_n) #define BLKTAP_RING_SIZE __BLKTAP_RING_SIZE(BLKTAP_PAGE_SIZE) #define BLKTAP_PAGE_SIZE sysconf(_SC_PAGE_SIZE) #define BLKTAP_GET_RESPONSE(_tap, _idx) \ (&(_tap)->sring->entry[(_idx) % BLKTAP_RING_SIZE].rsp) #define BLKTAP_GET_REQUEST(_tap, _idx) \ (&(_tap)->sring->entry[(_idx) % BLKTAP_RING_SIZE].req) static void __tapdisk_blktap_close(td_blktap_t *); struct td_blktap_req { td_vbd_request_t vreq; unsigned int id; char name[16]; struct td_iovec iov[BLKTAP_SEGMENT_MAX]; }; td_blktap_req_t * tapdisk_blktap_alloc_request(td_blktap_t *tap) { td_blktap_req_t *req = NULL; if (likely(tap->n_reqs_free)) req = tap->reqs_free[--tap->n_reqs_free]; return req; } void tapdisk_blktap_free_request(td_blktap_t *tap, td_blktap_req_t *req) { BUG_ON(tap->n_reqs_free >= tap->n_reqs); tap->reqs_free[tap->n_reqs_free++] = req; } static void tapdisk_blktap_reqs_free(td_blktap_t *tap) { if (tap->reqs) { free(tap->reqs); tap->reqs = NULL; } if (tap->reqs_free) { free(tap->reqs_free); 
tap->reqs_free = NULL; } } static int tapdisk_blktap_reqs_init(td_blktap_t *tap, int n_reqs) { int i, err; tap->reqs = malloc(n_reqs * sizeof(td_blktap_req_t)); if (!tap->reqs) { err = -errno; goto fail; } tap->reqs_free = malloc(n_reqs * sizeof(td_blktap_req_t*)); if (!tap->reqs_free) { err = -errno; goto fail; } tap->n_reqs = n_reqs; tap->n_reqs_free = 0; for (i = 0; i < n_reqs; i++) tapdisk_blktap_free_request(tap, &tap->reqs[i]); return 0; fail: tapdisk_blktap_reqs_free(tap); return err; } static void tapdisk_blktap_kick(td_blktap_t *tap) { if (likely(tap->fd >= 0)) { ioctl(tap->fd, BLKTAP_IOCTL_RESPOND, 0); tap->stats.kicks.out++; } } static int tapdisk_blktap_error_status(td_blktap_t *tap, int error) { int status; switch (error) { case 0: status = BLKTAP_RSP_OKAY; break; case -EOPNOTSUPP: case EOPNOTSUPP: status = BLKTAP_RSP_EOPNOTSUPP; break; default: status = BLKTAP_RSP_ERROR; break; } return status; } static void __tapdisk_blktap_push_response(td_blktap_t *tap, int final) { tap->rsp_prod_pvt++; if (final) { tap->sring->rsp_prod = tap->rsp_prod_pvt; tapdisk_blktap_kick(tap); } tap->stats.reqs.out++; } static void tapdisk_blktap_fail_request(td_blktap_t *tap, blktap_ring_req_t *msg, int error) { blktap_ring_rsp_t *rsp; BUG_ON(!tap->vma); rsp = BLKTAP_GET_RESPONSE(tap, tap->rsp_prod_pvt); rsp->id = msg->id; rsp->operation = msg->operation; rsp->status = tapdisk_blktap_error_status(tap, error); __tapdisk_blktap_push_response(tap, 1); } static void tapdisk_blktap_put_response(td_blktap_t *tap, td_blktap_req_t *req, int error, int final) { blktap_ring_rsp_t *rsp; int op = 0; BUG_ON(!tap->vma); rsp = BLKTAP_GET_RESPONSE(tap, tap->rsp_prod_pvt); switch (req->vreq.op) { case TD_OP_READ: op = BLKTAP_OP_READ; break; case TD_OP_WRITE: op = BLKTAP_OP_WRITE; break; default: BUG(); } rsp->id = req->id; rsp->operation = op; rsp->status = tapdisk_blktap_error_status(tap, error); __tapdisk_blktap_push_response(tap, final); } static void 
tapdisk_blktap_complete_request(td_blktap_t *tap, td_blktap_req_t *req, int error, int final) { if (likely(tap->vma)) tapdisk_blktap_put_response(tap, req, error, final); tapdisk_blktap_free_request(tap, req); } static void __tapdisk_blktap_request_cb(td_vbd_request_t *vreq, int error, void *token, int final) { td_blktap_req_t *req = containerof(vreq, td_blktap_req_t, vreq); td_blktap_t *tap = token; tapdisk_blktap_complete_request(tap, req, error, final); } static void tapdisk_blktap_vector_request(td_blktap_t *tap, const blktap_ring_req_t *msg, td_blktap_req_t *req) { td_vbd_request_t *vreq = &req->vreq; const struct blktap_segment *seg; struct td_iovec *iov; void *page, *next, *last; size_t size; int i; iov = req->iov - 1; last = NULL; page = tap->vstart; page += msg->id * BLKTAP_SEGMENT_MAX * BLKTAP_PAGE_SIZE; for (i = 0; i < msg->nr_segments; i++) { seg = &msg->seg[i]; next = page + (seg->first_sect << SECTOR_SHIFT); size = seg->last_sect - seg->first_sect + 1; if (next != last) { iov++; iov->base = next; iov->secs = size; } else iov->secs += size; last = iov->base + (iov->secs << SECTOR_SHIFT); page += BLKTAP_PAGE_SIZE; } vreq->iov = req->iov; vreq->iovcnt = iov - req->iov + 1; vreq->sec = msg->sector_number; } static int tapdisk_blktap_parse_request(td_blktap_t *tap, const blktap_ring_req_t *msg, td_blktap_req_t *req) { td_vbd_request_t *vreq = &req->vreq; int op, err = -EINVAL; memset(req, 0, sizeof(*req)); switch (msg->operation) { case BLKTAP_OP_READ: op = TD_OP_READ; break; case BLKTAP_OP_WRITE: op = TD_OP_WRITE; break; default: goto fail; } if (msg->id > BLKTAP_RING_SIZE) goto fail; if (msg->nr_segments < 1 || msg->nr_segments > BLKTAP_SEGMENT_MAX) goto fail; req->id = msg->id; snprintf(req->name, sizeof(req->name), "tap-%d.%d", tap->minor, req->id); vreq->op = op; vreq->name = req->name; vreq->token = tap; vreq->cb = __tapdisk_blktap_request_cb; tapdisk_blktap_vector_request(tap, msg, req); err = 0; fail: return err; } static void 
tapdisk_blktap_get_requests(td_blktap_t *tap) { unsigned int rp, rc; int err; rp = tap->sring->req_prod; for (rc = tap->req_cons; rc != rp; rc++) { blktap_ring_req_t *msg = BLKTAP_GET_REQUEST(tap, rc); td_blktap_req_t *req; tap->stats.reqs.in++; req = tapdisk_blktap_alloc_request(tap); if (!req) { err = -EFAULT; goto fail_ring; } err = tapdisk_blktap_parse_request(tap, msg, req); if (err) { tapdisk_blktap_fail_request(tap, msg, err); tapdisk_blktap_free_request(tap, req); goto fail_ring; } err = tapdisk_vbd_queue_request(tap->vbd, &req->vreq); if (err) tapdisk_blktap_complete_request(tap, req, err, 1); } tap->req_cons = rc; return; fail_ring: ERR(err, "ring error, disconnecting."); __tapdisk_blktap_close(tap); } static void tapdisk_blktap_fd_event(event_id_t id, char mode, void *data) { td_blktap_t *tap = data; tap->stats.kicks.in++; tapdisk_blktap_get_requests(tap); } int tapdisk_blktap_remove_device(td_blktap_t *tap) { int err = 0; if (likely(tap->fd >= 0)) { err = ioctl(tap->fd, BLKTAP_IOCTL_REMOVE_DEVICE); if (err) err = -errno; } return err; } int tapdisk_blktap_compat_create_device(td_blktap_t *tap, const struct blktap_device_info *bdi) { struct blktap2_params params; int err; memset(¶ms, 0, sizeof(params)); params.capacity = bdi->capacity; params.sector_size = bdi->sector_size; err = ioctl(tap->fd, BLKTAP_IOCTL_CREATE_DEVICE_COMPAT, ¶ms); if (err) { err = -errno; return err; } if (bdi->flags || bdi->physical_sector_size != bdi->sector_size) WARN("fell back to compat ioctl(%d)", BLKTAP_IOCTL_CREATE_DEVICE_COMPAT); return 0; } #ifndef ENOIOCTLCMD #define ENOIOCTLCMD 515 #endif int tapdisk_blktap_create_device(td_blktap_t *tap, const td_disk_info_t *info, int rdonly) { struct blktap_device_info bdi; unsigned long flags; int err; memset(&bdi, 0, sizeof(bdi)); flags = 0; flags |= rdonly & TD_OPEN_RDONLY ? 
BLKTAP_DEVICE_RO : 0; bdi.capacity = info->size; bdi.sector_size = info->sector_size; bdi.physical_sector_size = info->sector_size; bdi.flags = flags; INFO("bdev: capacity=%llu sector_size=%u/%u flags=%#lx", bdi.capacity, bdi.sector_size, bdi.physical_sector_size, bdi.flags); err = ioctl(tap->fd, BLKTAP_IOCTL_CREATE_DEVICE, &bdi); if (!err) return 0; err = -errno; if (err == -ENOTTY || err == -ENOIOCTLCMD) err = tapdisk_blktap_compat_create_device(tap, &bdi); return err; } static void tapdisk_blktap_unmap(td_blktap_t *tap) { if (tap->vma) { munmap(tap->vma, tap->vma_size); tap->vma = NULL; } } static int tapdisk_blktap_map(td_blktap_t *tap) { int prot, flags, err; void *vma; tap->vma_size = 1 + (BLKTAP_RING_SIZE * BLKTAP_SEGMENT_MAX * BLKTAP_PAGE_SIZE); prot = PROT_READ | PROT_WRITE; flags = MAP_SHARED; vma = mmap(NULL, tap->vma_size, prot, flags, tap->fd, 0); if (vma == MAP_FAILED) { err = -errno; goto fail; } tap->vma = vma; tap->vstart = vma + BLKTAP_PAGE_SIZE; tap->req_cons = 0; tap->rsp_prod_pvt = 0; tap->sring = vma; return 0; fail: tapdisk_blktap_unmap(tap); return err; } static void __tapdisk_blktap_close(td_blktap_t *tap) { /* * NB. this can bail out at runtime. after munmap, blktap * already failed all pending block reqs. AIO on buffers will * -EFAULT. vreq completion just backs off once fd/vma are * gone, so we'll drain, then idle until close(). 
*/ if (tap->event_id >= 0) { tapdisk_server_unregister_event(tap->event_id); tap->event_id = -1; } tapdisk_blktap_unmap(tap); if (tap->fd >= 0) { close(tap->fd); tap->fd = -1; } } void tapdisk_blktap_close(td_blktap_t *tap) { __tapdisk_blktap_close(tap); tapdisk_blktap_reqs_free(tap); free(tap); } int tapdisk_blktap_open(const char *devname, td_vbd_t *vbd, td_blktap_t **_tap) { td_blktap_t *tap; struct stat st; int err; tap = malloc(sizeof(*tap)); if (!tap) { err = -errno; goto fail; } memset(tap, 0, sizeof(*tap)); tap->fd = -1; tap->event_id = -1; tap->fd = open(devname, O_RDWR); if (tap->fd < 0) { err = -errno; goto fail; } err = fstat(tap->fd, &st); if (err) { err = -errno; goto fail; } tap->vbd = vbd; tap->minor = minor(st.st_rdev); err = tapdisk_blktap_map(tap); if (err) goto fail; tap->event_id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, tap->fd, 0, tapdisk_blktap_fd_event, tap); if (tap->event_id < 0) { err = tap->event_id; goto fail; } err = tapdisk_blktap_reqs_init(tap, BLKTAP_RING_SIZE); if (err) goto fail; if (_tap) *_tap = tap; return 0; fail: if (tap) tapdisk_blktap_close(tap); return err; } void tapdisk_blktap_stats(td_blktap_t *tap, td_stats_t *st) { tapdisk_stats_field(st, "minor", "d", tap->minor); tapdisk_stats_field(st, "reqs", "["); tapdisk_stats_val(st, "llu", tap->stats.reqs.in); tapdisk_stats_val(st, "llu", tap->stats.reqs.out); tapdisk_stats_leave(st, ']'); tapdisk_stats_field(st, "kicks", "["); tapdisk_stats_val(st, "llu", tap->stats.kicks.in); tapdisk_stats_val(st, "llu", tap->stats.kicks.out); tapdisk_stats_leave(st, ']'); } blktap-2.0.90/drivers/tapdisk-ring.h0000644000000000000000000000621211664745551016017 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef _TAPDISK_RING_H_ #define _TAPDISK_RING_H_ #include #include #include typedef struct td_uring td_uring_t; typedef struct td_uring_header td_uring_header_t; typedef struct td_uring_request td_uring_request_t; typedef struct td_uring_response td_uring_response_t; struct td_uring { int ctlfd; char *shmem_path; char *ctlfd_path; void *shmem; void *ring_area; void *data_area; }; struct td_uring_header { char cookie[8]; uint32_t version; uint32_t shmem_size; uint32_t ring_size; uint32_t data_size; char reserved[4064]; }; struct td_uring_request { uint8_t op; uint64_t id; uint64_t sec; uint32_t secs; uint32_t offset; }; struct td_uring_response { uint8_t op; uint64_t id; uint8_t status; }; DEFINE_RING_TYPES(td_uring, td_uring_request_t, td_uring_response_t); int tapdisk_uring_create(td_uring_t *, const char *location, uint32_t ring_size, uint32_t data_size); int tapdisk_uring_destroy(td_uring_t *); int tapdisk_uring_connect(td_uring_t *, const char *location); int tapdisk_uring_disconnect(td_uring_t *); int tapdisk_uring_poll(td_uring_t *); int tapdisk_uring_kick(td_uring_t *); #endif blktap-2.0.90/drivers/tapdisk-vbd.c0000644000000000000000000007311111664745551015630 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include #include #include "libvhd.h" #include "tapdisk-blktap.h" #include "tapdisk-image.h" #include "tapdisk-driver.h" #include "tapdisk-server.h" #include "tapdisk-vbd.h" #include "tapdisk-disktype.h" #include "tapdisk-interface.h" #include "tapdisk-stats.h" #include "tapdisk-storage.h" #define DBG(_level, _f, _a...) tlog_write(_level, _f, ##_a) #define ERR(_err, _f, _a...) 
tlog_error(_err, _f, ##_a) #if 1 #define ASSERT(p) \ do { \ if (!(p)) { \ DPRINTF("Assertion '%s' failed, line %d, " \ "file %s", #p, __LINE__, __FILE__); \ *(int*)0 = 0; \ } \ } while (0) #else #define ASSERT(p) ((void)0) #endif #define TD_VBD_EIO_RETRIES 10 #define TD_VBD_EIO_SLEEP 1 #define TD_VBD_WATCHDOG_TIMEOUT 10 static void tapdisk_vbd_complete_vbd_request(td_vbd_t *, td_vbd_request_t *); static int tapdisk_vbd_queue_ready(td_vbd_t *); static void tapdisk_vbd_check_queue_state(td_vbd_t *); /* * initialization */ static void tapdisk_vbd_mark_progress(td_vbd_t *vbd) { gettimeofday(&vbd->ts, NULL); } td_vbd_t* tapdisk_vbd_create(uint16_t uuid) { td_vbd_t *vbd; vbd = calloc(1, sizeof(td_vbd_t)); if (!vbd) { EPRINTF("failed to allocate tapdisk state\n"); return NULL; } vbd->uuid = uuid; INIT_LIST_HEAD(&vbd->images); INIT_LIST_HEAD(&vbd->new_requests); INIT_LIST_HEAD(&vbd->pending_requests); INIT_LIST_HEAD(&vbd->failed_requests); INIT_LIST_HEAD(&vbd->completed_requests); INIT_LIST_HEAD(&vbd->next); tapdisk_vbd_mark_progress(vbd); return vbd; } int tapdisk_vbd_initialize(int rfd, int wfd, uint16_t uuid) { td_vbd_t *vbd; vbd = tapdisk_server_get_vbd(uuid); if (vbd) { EPRINTF("duplicate vbds! 
%u\n", uuid); return -EEXIST; } vbd = tapdisk_vbd_create(uuid); tapdisk_server_add_vbd(vbd); return 0; } static int tapdisk_vbd_validate_chain(td_vbd_t *vbd) { return tapdisk_image_validate_chain(&vbd->images); } void tapdisk_vbd_close_vdi(td_vbd_t *vbd) { tapdisk_image_close_chain(&vbd->images); if (vbd->secondary && vbd->secondary_mode != TD_VBD_SECONDARY_MIRROR) { tapdisk_image_close(vbd->secondary); vbd->secondary = NULL; } if (vbd->retired) { tapdisk_image_close(vbd->retired); vbd->retired = NULL; } td_flag_set(vbd->state, TD_VBD_CLOSED); } static int tapdisk_vbd_add_block_cache(td_vbd_t *vbd) { td_image_t *cache, *image, *target, *tmp; int err; target = NULL; tapdisk_vbd_for_each_image(vbd, image, tmp) if (td_flag_test(image->flags, TD_OPEN_RDONLY) && td_flag_test(image->flags, TD_OPEN_SHAREABLE)) { target = image; break; } if (!target) return 0; cache = tapdisk_image_allocate(target->name, DISK_TYPE_BLOCK_CACHE, target->flags); if (!cache) return -ENOMEM; /* try to load existing cache */ err = td_load(cache); if (!err) goto done; /* hack driver to send open() correct image size */ if (!target->driver) { err = -ENODEV; goto fail; } cache->driver = tapdisk_driver_allocate(cache->type, cache->name, cache->flags); if (!cache->driver) { err = -ENOMEM; goto fail; } cache->driver->info = target->driver->info; /* try to open new cache */ err = td_open(cache); if (!err) goto done; fail: /* give up */ tapdisk_image_free(target); return err; done: /* insert cache before image */ list_add(&cache->next, target->next.prev); return 0; } static int tapdisk_vbd_add_local_cache(td_vbd_t *vbd) { td_image_t *cache, *parent; int err; parent = tapdisk_vbd_first_image(vbd); if (tapdisk_vbd_is_last_image(vbd, parent)) { DPRINTF("Single-image chain, nothing to cache"); return 0; } cache = tapdisk_image_allocate(parent->name, DISK_TYPE_LCACHE, parent->flags); if (!cache) return -ENOMEM; /* try to load existing cache */ err = td_load(cache); if (!err) goto done; cache->driver = 
tapdisk_driver_allocate(cache->type, cache->name, cache->flags); if (!cache->driver) { err = -ENOMEM; goto fail; } cache->driver->info = parent->driver->info; /* try to open new cache */ err = td_open(cache); if (!err) goto done; fail: tapdisk_image_free(cache); return err; done: /* insert cache right above leaf image */ list_add(&cache->next, &parent->next); DPRINTF("Added local_cache driver\n"); return 0; } int tapdisk_vbd_add_secondary(td_vbd_t *vbd) { td_image_t *leaf, *second = NULL; const char *path; int type, err; DPRINTF("Adding secondary image: %s\n", vbd->secondary_name); type = tapdisk_disktype_parse_params(vbd->secondary_name, &path); if (type < 0) return type; leaf = tapdisk_vbd_first_image(vbd); if (!leaf) { err = -EINVAL; goto fail; } err = tapdisk_image_open(type, path, leaf->flags, &second); if (err) goto fail; if (second->info.size != leaf->info.size) { EPRINTF("Secondary image size %"PRIu64" != image size %"PRIu64"\n", second->info.size, leaf->info.size); err = -EINVAL; goto fail; } vbd->secondary = second; leaf->flags |= TD_IGNORE_ENOSPC; if (td_flag_test(vbd->flags, TD_OPEN_STANDBY)) { DPRINTF("In standby mode\n"); vbd->secondary_mode = TD_VBD_SECONDARY_STANDBY; } else { DPRINTF("In mirror mode\n"); vbd->secondary_mode = TD_VBD_SECONDARY_MIRROR; /* we actually need this image to also be part of the chain, * since it may already contain data */ list_add(&second->next, &leaf->next); } DPRINTF("Added secondary image\n"); return 0; fail: if (second) tapdisk_image_close(second); return err; } static void signal_enospc(td_vbd_t *vbd) { int fd, err; char *fn; err = asprintf(&fn, BLKTAP2_ENOSPC_SIGNAL_FILE"%d", vbd->tap->minor); if (err == -1) { EPRINTF("Failed to signal ENOSPC condition\n"); return; } fd = open(fn, O_WRONLY | O_CREAT | O_NONBLOCK, 0666); if (fd == -1) EPRINTF("Failed to open file to signal ENOSPC condition\n"); else close(fd); free(fn); } #if 0 static int tapdisk_vbd_open_index(td_vbd_t *vbd) { int err; char *path; td_flag_t flags; 
td_image_t *last, *image; last = tapdisk_vbd_last_image(vbd); err = asprintf(&path, "%s.bat", last->name); if (err == -1) return -errno; err = access(path, R_OK); if (err == -1) { free(path); return -errno; } flags = vbd->flags | TD_OPEN_RDONLY | TD_OPEN_SHAREABLE; image = tapdisk_image_allocate(path, DISK_TYPE_VINDEX, flags); if (!image) { err = -ENOMEM; goto fail; } err = td_open(image); if (err) goto fail; tapdisk_vbd_add_image(vbd, image); return 0; fail: if (image) tapdisk_image_free(image); free(path); return err; } #endif static int tapdisk_vbd_add_dirty_log(td_vbd_t *vbd) { int err; td_driver_t *driver; td_image_t *log, *parent; driver = NULL; log = NULL; parent = tapdisk_vbd_first_image(vbd); log = tapdisk_image_allocate(parent->name, DISK_TYPE_LOG, parent->flags); if (!log) return -ENOMEM; driver = tapdisk_driver_allocate(log->type, log->name, log->flags); if (!driver) { err = -ENOMEM; goto fail; } driver->info = parent->driver->info; log->driver = driver; err = td_open(log); if (err) goto fail; tapdisk_vbd_add_image(vbd, log); return 0; fail: tapdisk_image_free(log); return err; } int tapdisk_vbd_open_vdi(td_vbd_t *vbd, const char *name, td_flag_t flags, int prt_devnum) { char *tmp = vbd->name; int err; if (!list_empty(&vbd->images)) { err = -EBUSY; goto fail; } if (!name && !vbd->name) { err = -EINVAL; goto fail; } if (name) { vbd->name = strdup(name); if (!vbd->name) { err = -errno; goto fail; } } err = tapdisk_image_open_chain(vbd->name, flags, prt_devnum, &vbd->images); if (err) goto fail; td_flag_clear(vbd->state, TD_VBD_CLOSED); vbd->flags = flags; if (td_flag_test(vbd->flags, TD_OPEN_LOG_DIRTY)) { err = tapdisk_vbd_add_dirty_log(vbd); if (err) goto fail; } if (td_flag_test(vbd->flags, TD_OPEN_ADD_CACHE)) { err = tapdisk_vbd_add_block_cache(vbd); if (err) goto fail; } if (td_flag_test(vbd->flags, TD_OPEN_LOCAL_CACHE)) { err = tapdisk_vbd_add_local_cache(vbd); if (err) goto fail; } err = tapdisk_vbd_validate_chain(vbd); if (err) goto fail; if 
(td_flag_test(vbd->flags, TD_OPEN_SECONDARY)) { err = tapdisk_vbd_add_secondary(vbd); if (err) goto fail; } if (tmp != vbd->name) free(tmp); return err; fail: if (vbd->name != tmp) { free(vbd->name); vbd->name = tmp; } if (!list_empty(&vbd->images)) tapdisk_image_close_chain(&vbd->images); vbd->flags = 0; return err; } void tapdisk_vbd_detach(td_vbd_t *vbd) { td_blktap_t *tap = vbd->tap; if (tap) { tapdisk_blktap_close(tap); vbd->tap = NULL; } } int tapdisk_vbd_attach(td_vbd_t *vbd, const char *devname, int minor) { if (vbd->tap) return -EALREADY; return tapdisk_blktap_open(devname, vbd, &vbd->tap); } int tapdisk_vbd_open(td_vbd_t *vbd, const char *name, int minor, const char *ring, td_flag_t flags) { int err; err = tapdisk_vbd_open_vdi(vbd, name, flags, -1); if (err) goto out; err = tapdisk_vbd_attach(vbd, ring, minor); if (err) goto out; return 0; out: tapdisk_vbd_detach(vbd); tapdisk_vbd_close_vdi(vbd); free(vbd->name); vbd->name = NULL; return err; } static void tapdisk_vbd_queue_count(td_vbd_t *vbd, int *new, int *pending, int *failed, int *completed) { int n, p, f, c; td_vbd_request_t *vreq, *tvreq; n = 0; p = 0; f = 0; c = 0; tapdisk_vbd_for_each_request(vreq, tvreq, &vbd->new_requests) n++; tapdisk_vbd_for_each_request(vreq, tvreq, &vbd->pending_requests) p++; tapdisk_vbd_for_each_request(vreq, tvreq, &vbd->failed_requests) f++; tapdisk_vbd_for_each_request(vreq, tvreq, &vbd->completed_requests) c++; *new = n; *pending = p; *failed = f; *completed = c; } static int tapdisk_vbd_shutdown(td_vbd_t *vbd) { int new, pending, failed, completed; if (!list_empty(&vbd->pending_requests)) return -EAGAIN; tapdisk_vbd_queue_count(vbd, &new, &pending, &failed, &completed); DPRINTF("%s: state: 0x%08x, new: 0x%02x, pending: 0x%02x, " "failed: 0x%02x, completed: 0x%02x\n", vbd->name, vbd->state, new, pending, failed, completed); DPRINTF("last activity: %010ld.%06ld, errors: 0x%04"PRIx64", " "retries: 0x%04"PRIx64", received: 0x%08"PRIx64", " "returned: 0x%08"PRIx64", 
kicked: 0x%08"PRIx64"\n", vbd->ts.tv_sec, vbd->ts.tv_usec, vbd->errors, vbd->retries, vbd->received, vbd->returned, vbd->kicked); tapdisk_vbd_close_vdi(vbd); tapdisk_vbd_detach(vbd); tapdisk_server_remove_vbd(vbd); free(vbd->name); free(vbd); return 0; } int tapdisk_vbd_close(td_vbd_t *vbd) { /* * don't close if any requests are pending in the aio layer */ if (!list_empty(&vbd->pending_requests)) goto fail; /* * if the queue is still active and we have more * requests, try to complete them before closing. */ if (tapdisk_vbd_queue_ready(vbd) && (!list_empty(&vbd->new_requests) || !list_empty(&vbd->failed_requests) || !list_empty(&vbd->completed_requests))) goto fail; return tapdisk_vbd_shutdown(vbd); fail: td_flag_set(vbd->state, TD_VBD_SHUTDOWN_REQUESTED); DBG(TLOG_WARN, "%s: requests pending\n", vbd->name); return -EAGAIN; } /* * control operations */ void tapdisk_vbd_debug(td_vbd_t *vbd) { td_image_t *image, *tmp; int new, pending, failed, completed; tapdisk_vbd_queue_count(vbd, &new, &pending, &failed, &completed); DBG(TLOG_WARN, "%s: state: 0x%08x, new: 0x%02x, pending: 0x%02x, " "failed: 0x%02x, completed: 0x%02x, last activity: %010ld.%06ld, " "errors: 0x%04"PRIx64", retries: 0x%04"PRIx64", " "received: 0x%08"PRIx64", returned: 0x%08"PRIx64", " "kicked: 0x%08"PRIx64"\n", vbd->name, vbd->state, new, pending, failed, completed, vbd->ts.tv_sec, vbd->ts.tv_usec, vbd->errors, vbd->retries, vbd->received, vbd->returned, vbd->kicked); tapdisk_vbd_for_each_image(vbd, image, tmp) td_debug(image); } static void tapdisk_vbd_drop_log(td_vbd_t *vbd) { if (td_flag_test(vbd->state, TD_VBD_LOG_DROPPED)) return; tapdisk_vbd_debug(vbd); tlog_precious(); td_flag_set(vbd->state, TD_VBD_LOG_DROPPED); } int tapdisk_vbd_get_disk_info(td_vbd_t *vbd, td_disk_info_t *info) { if (list_empty(&vbd->images)) return -EINVAL; *info = tapdisk_vbd_first_image(vbd)->info; return 0; } static int tapdisk_vbd_queue_ready(td_vbd_t *vbd) { return (!td_flag_test(vbd->state, TD_VBD_DEAD) && 
!td_flag_test(vbd->state, TD_VBD_CLOSED) && !td_flag_test(vbd->state, TD_VBD_QUIESCED) && !td_flag_test(vbd->state, TD_VBD_QUIESCE_REQUESTED)); } int tapdisk_vbd_retry_needed(td_vbd_t *vbd) { return !(list_empty(&vbd->failed_requests) && list_empty(&vbd->new_requests)); } int tapdisk_vbd_lock(td_vbd_t *vbd) { return 0; } int tapdisk_vbd_quiesce_queue(td_vbd_t *vbd) { if (!list_empty(&vbd->pending_requests)) { td_flag_set(vbd->state, TD_VBD_QUIESCE_REQUESTED); return -EAGAIN; } td_flag_clear(vbd->state, TD_VBD_QUIESCE_REQUESTED); td_flag_set(vbd->state, TD_VBD_QUIESCED); return 0; } int tapdisk_vbd_start_queue(td_vbd_t *vbd) { td_flag_clear(vbd->state, TD_VBD_QUIESCED); td_flag_clear(vbd->state, TD_VBD_QUIESCE_REQUESTED); tapdisk_vbd_mark_progress(vbd); return 0; } int tapdisk_vbd_kill_queue(td_vbd_t *vbd) { tapdisk_vbd_quiesce_queue(vbd); td_flag_set(vbd->state, TD_VBD_DEAD); return 0; } #if 0 static int tapdisk_vbd_open_image(td_vbd_t *vbd, td_image_t *image) { int err; td_image_t *parent; err = td_open(image); if (err) return err; if (!tapdisk_vbd_is_last_image(vbd, image)) { parent = tapdisk_vbd_next_image(image); err = td_validate_parent(image, parent); if (err) { td_close(image); return err; } } return 0; } #endif int tapdisk_vbd_pause(td_vbd_t *vbd) { int err; DBG(TLOG_DBG, "pause requested\n"); td_flag_set(vbd->state, TD_VBD_PAUSE_REQUESTED); err = tapdisk_vbd_quiesce_queue(vbd); if (err) return err; tapdisk_vbd_close_vdi(vbd); DBG(TLOG_DBG, "pause completed\n"); td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED); td_flag_set(vbd->state, TD_VBD_PAUSED); return 0; } int tapdisk_vbd_resume(td_vbd_t *vbd, const char *name) { int i, err; DBG(TLOG_DBG, "resume requested\n"); if (!td_flag_test(vbd->state, TD_VBD_PAUSED)) { EPRINTF("resume request for unpaused vbd %s\n", vbd->name); return -EINVAL; } for (i = 0; i < TD_VBD_EIO_RETRIES; i++) { err = tapdisk_vbd_open_vdi(vbd, name, vbd->flags | TD_OPEN_STRICT, -1); if (!err) break; sleep(TD_VBD_EIO_SLEEP); } if (err) 
return err; DBG(TLOG_DBG, "resume completed\n"); tapdisk_vbd_start_queue(vbd); td_flag_clear(vbd->state, TD_VBD_PAUSED); td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED); tapdisk_vbd_check_state(vbd); DBG(TLOG_DBG, "state checked\n"); return 0; } static int tapdisk_vbd_request_ttl(td_vbd_request_t *vreq, const struct timeval *now) { struct timeval delta; timersub(now, &vreq->ts, &delta); return TD_VBD_REQUEST_TIMEOUT - delta.tv_sec; } static int __tapdisk_vbd_request_timeout(td_vbd_request_t *vreq, const struct timeval *now) { int timeout; timeout = tapdisk_vbd_request_ttl(vreq, now) < 0; if (timeout) ERR(vreq->error, "req %s timed out, retried %d times\n", vreq->name, vreq->num_retries); return timeout; } static int tapdisk_vbd_request_timeout(td_vbd_request_t *vreq) { struct timeval now; gettimeofday(&now, NULL); return __tapdisk_vbd_request_timeout(vreq, &now); } static void tapdisk_vbd_check_queue_state(td_vbd_t *vbd) { td_vbd_request_t *vreq, *tmp; struct timeval now; gettimeofday(&now, NULL); tapdisk_vbd_for_each_request(vreq, tmp, &vbd->failed_requests) if (__tapdisk_vbd_request_timeout(vreq, &now)) tapdisk_vbd_complete_vbd_request(vbd, vreq); if (!list_empty(&vbd->new_requests) || !list_empty(&vbd->failed_requests)) tapdisk_vbd_issue_requests(vbd); } void tapdisk_vbd_check_state(td_vbd_t *vbd) { tapdisk_vbd_check_queue_state(vbd); if (td_flag_test(vbd->state, TD_VBD_QUIESCE_REQUESTED)) tapdisk_vbd_quiesce_queue(vbd); if (td_flag_test(vbd->state, TD_VBD_PAUSE_REQUESTED)) tapdisk_vbd_pause(vbd); if (td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED)) tapdisk_vbd_close(vbd); } void tapdisk_vbd_check_progress(td_vbd_t *vbd) { time_t diff; struct timeval now, delta; if (list_empty(&vbd->pending_requests)) return; gettimeofday(&now, NULL); timersub(&now, &vbd->ts, &delta); diff = delta.tv_sec; if (diff >= TD_VBD_WATCHDOG_TIMEOUT && tapdisk_vbd_queue_ready(vbd)) { DBG(TLOG_WARN, "%s: watchdog timeout: pending requests " "idle for %ld seconds\n", vbd->name, 
diff); tapdisk_vbd_drop_log(vbd); return; } tapdisk_server_set_max_timeout(TD_VBD_WATCHDOG_TIMEOUT - diff); } /* * request submission */ static int tapdisk_vbd_check_queue(td_vbd_t *vbd) { if (list_empty(&vbd->images)) return -ENOSYS; if (!tapdisk_vbd_queue_ready(vbd)) return -EAGAIN; return 0; } static int tapdisk_vbd_request_should_retry(td_vbd_t *vbd, td_vbd_request_t *vreq) { if (td_flag_test(vbd->state, TD_VBD_DEAD) || td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED)) return 0; switch (abs(vreq->error)) { case EPERM: case ENOSYS: case ESTALE: case ENOSPC: return 0; } if (tapdisk_vbd_request_timeout(vreq)) return 0; return 1; } static void tapdisk_vbd_complete_vbd_request(td_vbd_t *vbd, td_vbd_request_t *vreq) { if (!vreq->submitting && !vreq->secs_pending) { if (vreq->error && tapdisk_vbd_request_should_retry(vbd, vreq)) tapdisk_vbd_move_request(vreq, &vbd->failed_requests); else tapdisk_vbd_move_request(vreq, &vbd->completed_requests); } } static void FIXME_maybe_count_enospc_redirect(td_vbd_t *vbd, td_request_t treq) { int write = treq.op == TD_OP_WRITE; if (write && treq.image == tapdisk_vbd_first_image(vbd) && vbd->FIXME_enospc_redirect_count_enabled) vbd->FIXME_enospc_redirect_count += treq.secs; } static void __tapdisk_vbd_complete_td_request(td_vbd_t *vbd, td_vbd_request_t *vreq, td_request_t treq, int res) { td_image_t *image = treq.image; int err; err = (res <= 0 ? res : -res); vbd->secs_pending -= treq.secs; vreq->secs_pending -= treq.secs; if (err != -EBUSY) { int write = treq.op == TD_OP_WRITE; td_sector_count_add(&image->stats.hits, treq.secs, write); if (err) td_sector_count_add(&image->stats.fail, treq.secs, write); FIXME_maybe_count_enospc_redirect(vbd, treq); } if (err) { if (err != -EBUSY) { if (!vreq->error && err != vreq->prev_error) tlog_drv_error(image->driver, err, "req %s: %s 0x%04x secs @ 0x%08"PRIx64, vreq->name, (treq.op == TD_OP_WRITE ? "write" : "read"), treq.secs, treq.sec); vbd->errors++; } vreq->error = (vreq->error ? 
: err); } tapdisk_vbd_complete_vbd_request(vbd, vreq); } static void __tapdisk_vbd_reissue_td_request(td_vbd_t *vbd, td_image_t *image, td_request_t treq) { td_image_t *parent; td_vbd_request_t *vreq; vreq = treq.vreq; gettimeofday(&vreq->last_try, NULL); vreq->submitting++; if (tapdisk_vbd_is_last_image(vbd, image)) { memset(treq.buf, 0, treq.secs << SECTOR_SHIFT); td_complete_request(treq, 0); goto done; } parent = tapdisk_vbd_next_image(image); treq.image = parent; /* return zeros for requests that extend beyond end of parent image */ if (treq.sec + treq.secs > parent->info.size) { td_request_t clone = treq; if (parent->info.size > treq.sec) { int secs = parent->info.size - treq.sec; clone.sec += secs; clone.secs -= secs; clone.buf += (secs << SECTOR_SHIFT); treq.secs = secs; } else treq.secs = 0; memset(clone.buf, 0, clone.secs << SECTOR_SHIFT); td_complete_request(clone, 0); if (!treq.secs) goto done; } switch (treq.op) { case TD_OP_WRITE: td_queue_write(parent, treq); break; case TD_OP_READ: td_queue_read(parent, treq); break; } done: vreq->submitting--; if (!vreq->secs_pending) tapdisk_vbd_complete_vbd_request(vbd, vreq); } void tapdisk_vbd_forward_request(td_request_t treq) { td_vbd_t *vbd; td_image_t *image; td_vbd_request_t *vreq; image = treq.image; vreq = treq.vreq; vbd = vreq->vbd; tapdisk_vbd_mark_progress(vbd); if (tapdisk_vbd_queue_ready(vbd)) __tapdisk_vbd_reissue_td_request(vbd, image, treq); else __tapdisk_vbd_complete_td_request(vbd, vreq, treq, -EBUSY); } void tapdisk_vbd_complete_td_request(td_request_t treq, int res) { td_vbd_t *vbd; td_image_t *image, *leaf; td_vbd_request_t *vreq; image = treq.image; vreq = treq.vreq; vbd = vreq->vbd; tapdisk_vbd_mark_progress(vbd); if (abs(res) == ENOSPC && td_flag_test(image->flags, TD_IGNORE_ENOSPC)) { res = 0; leaf = tapdisk_vbd_first_image(vbd); if (vbd->secondary_mode == TD_VBD_SECONDARY_MIRROR) { DPRINTF("ENOSPC: disabling mirroring\n"); list_del_init(&leaf->next); vbd->retired = leaf; } else if 
(vbd->secondary_mode == TD_VBD_SECONDARY_STANDBY) { DPRINTF("ENOSPC: failing over to secondary image\n"); list_add(&vbd->secondary->next, leaf->next.prev); vbd->FIXME_enospc_redirect_count_enabled = 1; } if (vbd->secondary_mode != TD_VBD_SECONDARY_DISABLED) { vbd->secondary = NULL; vbd->secondary_mode = TD_VBD_SECONDARY_DISABLED; signal_enospc(vbd); } } DBG(TLOG_DBG, "%s: req %s seg %d sec 0x%08"PRIx64 " secs 0x%04x buf %p op %d res %d\n", image->name, vreq->name, treq.sidx, treq.sec, treq.secs, treq.buf, vreq->op, res); __tapdisk_vbd_complete_td_request(vbd, vreq, treq, res); } static inline void queue_mirror_req(td_vbd_t *vbd, td_request_t clone) { clone.image = vbd->secondary; td_queue_write(vbd->secondary, clone); } static int tapdisk_vbd_issue_request(td_vbd_t *vbd, td_vbd_request_t *vreq) { td_image_t *image; td_request_t treq; td_sector_t sec; int i, err; sec = vreq->sec; image = tapdisk_vbd_first_image(vbd); vreq->submitting = 1; tapdisk_vbd_mark_progress(vbd); vreq->last_try = vbd->ts; tapdisk_vbd_move_request(vreq, &vbd->pending_requests); err = tapdisk_vbd_check_queue(vbd); if (err) { vreq->error = err; goto fail; } err = tapdisk_image_check_request(image, vreq); if (err) { vreq->error = err; goto fail; } for (i = 0; i < vreq->iovcnt; i++) { struct td_iovec *iov = &vreq->iov[i]; treq.sidx = i; treq.buf = iov->base; treq.sec = sec; treq.secs = iov->secs; treq.image = image; treq.cb = tapdisk_vbd_complete_td_request; treq.cb_data = NULL; treq.vreq = vreq; vreq->secs_pending += iov->secs; vbd->secs_pending += iov->secs; if (vbd->secondary_mode == TD_VBD_SECONDARY_MIRROR && vreq->op == TD_OP_WRITE) { vreq->secs_pending += iov->secs; vbd->secs_pending += iov->secs; } switch (vreq->op) { case TD_OP_WRITE: treq.op = TD_OP_WRITE; /* it's important to queue the mirror request before queuing * the main one. 
If the main image runs into ENOSPC, the * mirroring could be disabled before td_queue_write returns, * so if the mirror request was queued after (which would then * not happen), we'd lose that write and cause the process to * hang with unacknowledged writes */ if (vbd->secondary_mode == TD_VBD_SECONDARY_MIRROR) queue_mirror_req(vbd, treq); td_queue_write(treq.image, treq); break; case TD_OP_READ: treq.op = TD_OP_READ; td_queue_read(treq.image, treq); break; } DBG(TLOG_DBG, "%s: req %s seg %d sec 0x%08"PRIx64" secs 0x%04x " "buf %p op %d\n", image->name, vreq->name, i, treq.sec, treq.secs, treq.buf, vreq->op); sec += iov->secs; } err = 0; out: vreq->submitting--; if (!vreq->secs_pending) { err = (err ? : vreq->error); tapdisk_vbd_complete_vbd_request(vbd, vreq); } return err; fail: vreq->error = err; goto out; } static int tapdisk_vbd_request_completed(td_vbd_t *vbd, td_vbd_request_t *vreq) { return vreq->list_head == &vbd->completed_requests; } static int tapdisk_vbd_reissue_failed_requests(td_vbd_t *vbd) { int err; struct timeval now; td_vbd_request_t *vreq, *tmp; err = 0; gettimeofday(&now, NULL); tapdisk_vbd_for_each_request(vreq, tmp, &vbd->failed_requests) { if (vreq->secs_pending) continue; if (td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED)) { tapdisk_vbd_complete_vbd_request(vbd, vreq); continue; } if (vreq->error != -EBUSY && now.tv_sec - vreq->last_try.tv_sec < TD_VBD_RETRY_INTERVAL) continue; vbd->retries++; vreq->num_retries++; vreq->prev_error = vreq->error; vreq->error = 0; DBG(TLOG_DBG, "retry #%d of req %s, " "sec 0x%08"PRIx64", iovcnt: %d\n", vreq->num_retries, vreq->name, vreq->sec, vreq->iovcnt); err = tapdisk_vbd_issue_request(vbd, vreq); /* * if this request failed, but was not completed, * we'll back off for a while. 
*/ if (err && !tapdisk_vbd_request_completed(vbd, vreq)) break; } return 0; } static void tapdisk_vbd_count_new_request(td_vbd_t *vbd, td_vbd_request_t *vreq) { struct td_iovec *iov; int write; write = vreq->op == TD_OP_WRITE; for (iov = &vreq->iov[0]; iov < &vreq->iov[vreq->iovcnt]; iov++) td_sector_count_add(&vbd->secs, iov->secs, write); } static int tapdisk_vbd_issue_new_requests(td_vbd_t *vbd) { int err; td_vbd_request_t *vreq, *tmp; tapdisk_vbd_for_each_request(vreq, tmp, &vbd->new_requests) { err = tapdisk_vbd_issue_request(vbd, vreq); /* * if this request failed, but was not completed, * we'll back off for a while. */ if (err && !tapdisk_vbd_request_completed(vbd, vreq)) return err; tapdisk_vbd_count_new_request(vbd, vreq); } return 0; } int tapdisk_vbd_recheck_state(td_vbd_t *vbd) { if (list_empty(&vbd->new_requests)) return 0; if (td_flag_test(vbd->state, TD_VBD_QUIESCED) || td_flag_test(vbd->state, TD_VBD_QUIESCE_REQUESTED)) return 0; tapdisk_vbd_issue_new_requests(vbd); return 1; } static int tapdisk_vbd_kill_requests(td_vbd_t *vbd) { td_vbd_request_t *vreq, *tmp; tapdisk_vbd_for_each_request(vreq, tmp, &vbd->new_requests) { vreq->error = -ESHUTDOWN; tapdisk_vbd_move_request(vreq, &vbd->completed_requests); } tapdisk_vbd_for_each_request(vreq, tmp, &vbd->failed_requests) { vreq->error = -ESHUTDOWN; tapdisk_vbd_move_request(vreq, &vbd->completed_requests); } return 0; } int tapdisk_vbd_issue_requests(td_vbd_t *vbd) { int err; if (td_flag_test(vbd->state, TD_VBD_DEAD)) return tapdisk_vbd_kill_requests(vbd); if (td_flag_test(vbd->state, TD_VBD_QUIESCED) || td_flag_test(vbd->state, TD_VBD_QUIESCE_REQUESTED)) return -EAGAIN; err = tapdisk_vbd_reissue_failed_requests(vbd); if (err) return err; return tapdisk_vbd_issue_new_requests(vbd); } int tapdisk_vbd_queue_request(td_vbd_t *vbd, td_vbd_request_t *vreq) { gettimeofday(&vreq->ts, NULL); vreq->vbd = vbd; list_add_tail(&vreq->next, &vbd->new_requests); vbd->received++; return 0; } void 
tapdisk_vbd_kick(td_vbd_t *vbd) { const struct list_head *list = &vbd->completed_requests; td_vbd_request_t *vreq, *prev, *next; vbd->kicked++; while (!list_empty(list)) { prev = list_entry(list->next, td_vbd_request_t, next); list_del(&prev->next); tapdisk_vbd_for_each_request(vreq, next, list) { if (vreq->token == prev->token) { prev->cb(prev, prev->error, prev->token, 0); vbd->returned++; list_del(&vreq->next); prev = vreq; } } prev->cb(prev, prev->error, prev->token, 1); vbd->returned++; } } void tapdisk_vbd_stats(td_vbd_t *vbd, td_stats_t *st) { td_image_t *image, *next; tapdisk_stats_enter(st, '{'); tapdisk_stats_field(st, "name", "s", vbd->name); tapdisk_stats_field(st, "secs", "["); tapdisk_stats_val(st, "llu", vbd->secs.rd); tapdisk_stats_val(st, "llu", vbd->secs.wr); tapdisk_stats_leave(st, ']'); tapdisk_stats_field(st, "images", "["); tapdisk_vbd_for_each_image(vbd, image, next) tapdisk_image_stats(image, st); tapdisk_stats_leave(st, ']'); if (vbd->tap) { tapdisk_stats_field(st, "tap", "{"); tapdisk_blktap_stats(vbd->tap, st); tapdisk_stats_leave(st, '}'); } tapdisk_stats_field(st, "FIXME_enospc_redirect_count", "llu", vbd->FIXME_enospc_redirect_count); tapdisk_stats_leave(st, '}'); } blktap-2.0.90/drivers/block-vindex.c0000644000000000000000000005145511664745551016014 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. 
nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include "tapdisk.h" #include "tapdisk-utils.h" #include "tapdisk-driver.h" #include "tapdisk-server.h" #include "tapdisk-interface.h" #include "libvhd.h" #include "libvhd-index.h" #define DBG(_level, _f, _a...) tlog_write(_level, _f, ##_a) #define ERR(_err, _f, _a...) tlog_error(_err, _f, ##_a) #define WARN(_f, _a...) 
tlog_write(TLOG_WARN, _f, ##_a) #define ASSERT(condition) \ if (!(condition)) { \ WARN("FAILED ASSERTION: '%s'\n", #condition); \ td_panic(); \ } #define VHD_INDEX_FILE_POOL_SIZE 12 #define VHD_INDEX_CACHE_SIZE 4 #define VHD_INDEX_REQUESTS (TAPDISK_DATA_REQUESTS + VHD_INDEX_CACHE_SIZE) #define VHD_INDEX_BLOCK_READ_PENDING 0x0001 #define VHD_INDEX_BLOCK_VALID 0x0002 #define VHD_INDEX_BAT_CLEAR 0 #define VHD_INDEX_BIT_CLEAR 1 #define VHD_INDEX_BIT_SET 2 #define VHD_INDEX_CACHE_MISS 3 #define VHD_INDEX_META_READ_PENDING 4 typedef struct vhd_index vhd_index_t; typedef struct vhd_index_block vhd_index_block_t; typedef struct vhd_index_request vhd_index_request_t; typedef struct vhd_index_file_ref vhd_index_file_ref_t; struct vhd_index_request { off64_t off; td_request_t treq; vhd_index_t *index; struct tiocb tiocb; struct list_head next; vhd_index_file_ref_t *file; }; struct vhd_index_block { uint64_t blk; uint32_t seqno; td_flag_t state; vhdi_block_t vhdi_block; int table_size; struct list_head queue; vhd_index_request_t req; }; struct vhd_index_file_ref { int fd; vhdi_file_id_t fid; uint32_t seqno; uint32_t refcnt; }; struct vhd_index { char *name; vhdi_bat_t bat; vhdi_context_t vhdi; vhdi_file_table_t files; vhd_index_file_ref_t fds[VHD_INDEX_FILE_POOL_SIZE]; vhd_index_block_t *cache[VHD_INDEX_CACHE_SIZE]; int cache_free_cnt; vhd_index_block_t *cache_free_list[VHD_INDEX_CACHE_SIZE]; vhd_index_block_t cache_list[VHD_INDEX_CACHE_SIZE]; int requests_free_cnt; vhd_index_request_t *requests_free_list[VHD_INDEX_REQUESTS]; vhd_index_request_t requests_list[VHD_INDEX_REQUESTS]; td_driver_t *driver; }; static void vhd_index_complete_meta_read(void *, struct tiocb *, int); static void vhd_index_complete_data_read(void *, struct tiocb *, int); #define vhd_index_block_for_each_request(_block, _req, _tmp) \ list_for_each_entry_safe((_req), (_tmp), &(_block)->queue, next) static inline void vhd_index_initialize_request(vhd_index_request_t *req) { memset(req, 0, 
sizeof(vhd_index_request_t)); INIT_LIST_HEAD(&req->next); } static inline void vhd_index_initialize_block(vhd_index_block_t *block) { block->blk = 0; block->state = 0; INIT_LIST_HEAD(&block->queue); vhd_index_initialize_request(&block->req); memset(block->vhdi_block.table, 0, block->table_size); } static void vhd_index_init(vhd_index_t *index) { int i; memset(index, 0, sizeof(vhd_index_t)); index->cache_free_cnt = VHD_INDEX_CACHE_SIZE; for (i = 0; i < VHD_INDEX_CACHE_SIZE; i++) { index->cache_free_list[i] = index->cache_list + i; vhd_index_initialize_block(index->cache_free_list[i]); } index->requests_free_cnt = VHD_INDEX_REQUESTS; for (i = 0; i < VHD_INDEX_REQUESTS; i++) { index->requests_free_list[i] = index->requests_list + i; vhd_index_initialize_request(index->requests_free_list[i]); } for (i = 0; i < VHD_INDEX_FILE_POOL_SIZE; i++) index->fds[i].fd = -1; } static int vhd_index_allocate_cache(vhd_index_t *index) { void *buf; int i, err; size_t size; size = vhd_bytes_padded(index->vhdi.spb * sizeof(vhdi_entry_t)); for (i = 0; i < VHD_INDEX_CACHE_SIZE; i++) { err = posix_memalign(&buf, VHD_SECTOR_SIZE, size); if (err) goto fail; memset(buf, 0, size); index->cache_list[i].vhdi_block.table = (vhdi_entry_t *)buf; index->cache_list[i].vhdi_block.entries = index->vhdi.spb; index->cache_list[i].table_size = size; } return 0; fail: for (i = 0; i < VHD_INDEX_CACHE_SIZE; i++) { free(index->cache_list[i].vhdi_block.table); index->cache_list[i].vhdi_block.table = NULL; } return -ENOMEM; } static void vhd_index_free(vhd_index_t *index) { int i; for (i = 0; i < VHD_INDEX_CACHE_SIZE; i++) free(index->cache_list[i].vhdi_block.table); for (i = 0; i < VHD_INDEX_FILE_POOL_SIZE; i++) if (index->fds[i].fd != -1) close(index->fds[i].fd); vhdi_file_table_free(&index->files); free(index->bat.table); free(index->name); } static int vhd_index_load(vhd_index_t *index) { int err; err = vhdi_bat_load(index->name, &index->bat); if (err) return err; err = vhdi_open(&index->vhdi, 
index->bat.index_path, O_RDONLY | O_DIRECT | O_LARGEFILE); if (err) goto fail; err = vhdi_file_table_load(index->bat.file_table_path, &index->files); if (err) { vhdi_close(&index->vhdi); goto fail; } return 0; fail: free(index->bat.table); memset(&index->bat, 0, sizeof(vhdi_bat_t)); memset(&index->vhdi, 0, sizeof(vhdi_context_t)); memset(&index->files, 0, sizeof(vhdi_file_table_t)); return err; } static int vhd_index_open(td_driver_t *driver, const char *name, td_flag_t flags) { int err; vhd_index_t *index; index = (vhd_index_t *)driver->data; vhd_index_init(index); index->name = strdup(name); if (!index->name) return -ENOMEM; err = vhd_index_load(index); if (err) { free(index->name); return err; } err = vhd_index_allocate_cache(index); if (err) { vhd_index_free(index); return err; } driver->info.size = index->bat.vhd_blocks * index->bat.vhd_block_size; driver->info.sector_size = VHD_SECTOR_SIZE; driver->info.info = 0; index->driver = driver; DPRINTF("opened vhd index %s\n", name); return 0; } static int vhd_index_close(td_driver_t *driver) { vhd_index_t *index; index = (vhd_index_t *)driver->data; vhdi_close(&index->vhdi); DPRINTF("closed vhd index %s\n", index->name); vhd_index_free(index); return 0; } static inline void vhd_index_touch_file_ref(vhd_index_t *index, vhd_index_file_ref_t *ref) { int i; if (++ref->seqno == 0xFFFFFFFF) for (i = 0; i < VHD_INDEX_FILE_POOL_SIZE; i++) index->fds[i].seqno >>= 1; } static inline void vhd_index_get_file_ref(vhd_index_file_ref_t *ref) { ++ref->refcnt; } static inline void vhd_index_put_file_ref(vhd_index_file_ref_t *ref) { --ref->refcnt; } static inline vhd_index_file_ref_t * vhd_index_find_lru_file_ref(vhd_index_t *index) { int i; uint32_t min; vhd_index_file_ref_t *lru; lru = NULL; min = (uint32_t)-1; for (i = 1; i < VHD_INDEX_FILE_POOL_SIZE; i++) { if (index->fds[i].refcnt) continue; if (!lru || index->fds[i].seqno < min) { min = index->fds[i].seqno; lru = index->fds + i; } } return lru; } static inline int 
vhd_index_open_file(vhd_index_t *index, vhdi_file_id_t id, vhd_index_file_ref_t *ref) { int i; char *path; path = NULL; for (i = 0; i < index->files.entries; i++) if (index->files.table[i].file_id == id) { path = index->files.table[i].path; break; } if (!path) return -ENOENT; ref->fd = open(path, O_RDONLY | O_DIRECT | O_LARGEFILE); if (ref->fd == -1) return -errno; ref->fid = id; ref->refcnt = 0; return 0; } static int vhd_index_get_file(vhd_index_t *index, vhdi_file_id_t id, vhd_index_file_ref_t **ref) { int i, err; vhd_index_file_ref_t *lru; *ref = NULL; for (i = 0; i < VHD_INDEX_FILE_POOL_SIZE; i++) if (id == index->fds[i].fid) { *ref = index->fds + i; vhd_index_touch_file_ref(index, *ref); vhd_index_get_file_ref(*ref); return 0; } lru = vhd_index_find_lru_file_ref(index); if (!lru) return -EBUSY; if (lru->fd != -1) close(lru->fd); err = vhd_index_open_file(index, id, lru); if (err) goto fail; vhd_index_touch_file_ref(index, lru); vhd_index_get_file_ref(lru); *ref = lru; return 0; fail: lru->fd = -1; lru->fid = 0; lru->refcnt = 0; return err; } static inline vhd_index_request_t * vhd_index_allocate_request(vhd_index_t *index) { vhd_index_request_t *req; if (index->requests_free_cnt <= 0) return NULL; req = index->requests_free_list[--index->requests_free_cnt]; ASSERT(!req->index); return req; } static inline void vhd_index_free_request(vhd_index_t *index, vhd_index_request_t *req) { list_del(&req->next); vhd_index_initialize_request(req); index->requests_free_list[index->requests_free_cnt++] = req; } static inline int vhd_index_block_valid(vhd_index_block_t *block) { return (!td_flag_test(block->state, VHD_INDEX_BLOCK_READ_PENDING) && td_flag_test(block->state, VHD_INDEX_BLOCK_VALID)); } static inline void vhd_index_touch_block(vhd_index_t *index, vhd_index_block_t *block) { int i; if (++block->seqno == 0xFFFFFFFF) for (i = 0; i < VHD_INDEX_CACHE_SIZE; i++) index->cache_list[i].seqno >>= 1; } static inline vhd_index_block_t * vhd_index_get_lru_block(vhd_index_t 
*index) { int i, idx; uint32_t min; vhd_index_block_t *block, *lru; lru = NULL; min = (uint32_t)-1; idx = 0; for (i = 0; i < VHD_INDEX_CACHE_SIZE; i++) { block = index->cache[i]; if (!block) continue; if (td_flag_test(block->state, VHD_INDEX_BLOCK_READ_PENDING)) continue; if (!lru || block->seqno < min) { lru = block; min = block->seqno; idx = i; } } if (lru) index->cache[idx] = NULL; return lru; } static inline int vhd_index_allocate_block(vhd_index_t *index, vhd_index_block_t **block) { vhd_index_block_t *b; *block = NULL; if (index->cache_free_cnt > 0) b = index->cache_free_list[--index->cache_free_cnt]; else { b = vhd_index_get_lru_block(index); if (!b) return -EBUSY; } vhd_index_initialize_block(b); vhd_index_touch_block(index, b); *block = b; return 0; } static int vhd_index_install_block(vhd_index_t *index, vhd_index_block_t **block, uint32_t blk) { int i, err; vhd_index_block_t *b; *block = NULL; err = vhd_index_allocate_block(index, &b); if (err) return err; b->blk = blk; for (i = 0; i < VHD_INDEX_CACHE_SIZE; i++) if (!index->cache[i]) { index->cache[i] = b; break; } ASSERT(i < VHD_INDEX_CACHE_SIZE); *block = b; return 0; } static inline vhd_index_block_t * vhd_index_get_block(vhd_index_t *index, uint32_t blk) { int i; vhd_index_block_t *block; for (i = 0; i < VHD_INDEX_CACHE_SIZE; i++) { block = index->cache[i]; if (!block) continue; if (block->blk == blk) return block; } return NULL; } static int vhd_index_read_cache(vhd_index_t *index, uint64_t sector) { uint32_t blk, sec; vhd_index_block_t *block; blk = sector / index->vhdi.spb; if (blk >= index->bat.vhd_blocks) return -EINVAL; if (index->bat.table[blk] == DD_BLK_UNUSED) return VHD_INDEX_BAT_CLEAR; block = vhd_index_get_block(index, blk); if (!block) return VHD_INDEX_CACHE_MISS; vhd_index_touch_block(index, block); if (td_flag_test(block->state, VHD_INDEX_BLOCK_READ_PENDING)) return VHD_INDEX_META_READ_PENDING; sec = sector % index->vhdi.spb; if (block->vhdi_block.table[sec].offset == DD_BLK_UNUSED) 
return VHD_INDEX_BIT_CLEAR; return VHD_INDEX_BIT_SET; } static int vhd_index_read_cache_span(vhd_index_t *index, uint64_t sector, int secs, int value) { int i; uint32_t blk, sec; vhd_index_block_t *block; blk = sector / index->vhdi.spb; sec = sector % index->vhdi.spb; ASSERT(blk < index->bat.vhd_blocks); block = vhd_index_get_block(index, blk); ASSERT(block && vhd_index_block_valid(block)); for (i = 0; i < secs && i + sec < index->vhdi.spb; i++) if (value ^ (block->vhdi_block.table[sec + i].offset != DD_BLK_UNUSED)) break; return i; } static int vhd_index_schedule_meta_read(vhd_index_t *index, uint32_t blk) { int err; off64_t offset; vhd_index_block_t *block; vhd_index_request_t *req; ASSERT(index->bat.table[blk] != DD_BLK_UNUSED); block = vhd_index_get_block(index, blk); if (!block) { err = vhd_index_install_block(index, &block, blk); if (err) return err; } offset = vhd_sectors_to_bytes(index->bat.table[blk]); req = &block->req; req->index = index; req->treq.sec = blk * index->vhdi.spb; req->treq.secs = block->table_size >> VHD_SECTOR_SHIFT; td_prep_read(&req->tiocb, index->vhdi.fd, (char *)block->vhdi_block.table, block->table_size, offset, vhd_index_complete_meta_read, req); td_queue_tiocb(index->driver, &req->tiocb); td_flag_set(block->state, VHD_INDEX_BLOCK_READ_PENDING); return 0; } static int vhd_index_schedule_data_read(vhd_index_t *index, td_request_t treq) { int i, err; size_t size; off64_t offset; uint32_t blk, sec; vhd_index_block_t *block; vhd_index_request_t *req; vhd_index_file_ref_t *file; blk = treq.sec / index->vhdi.spb; sec = treq.sec % index->vhdi.spb; block = vhd_index_get_block(index, blk); ASSERT(block && vhd_index_block_valid(block)); for (i = 0; i < treq.secs; i++) { ASSERT(block->vhdi_block.table[sec + i].file_id != 0); ASSERT(block->vhdi_block.table[sec + i].offset != DD_BLK_UNUSED); } req = vhd_index_allocate_request(index); if (!req) return -EBUSY; err = vhd_index_get_file(index, block->vhdi_block.table[sec].file_id, &file); if (err) { 
vhd_index_free_request(index, req); return err; } size = vhd_sectors_to_bytes(treq.secs); offset = vhd_sectors_to_bytes(block->vhdi_block.table[sec].offset); req->file = file; req->treq = treq; req->index = index; req->off = offset; td_prep_read(&req->tiocb, file->fd, treq.buf, size, offset, vhd_index_complete_data_read, req); td_queue_tiocb(index->driver, &req->tiocb); return 0; } static int vhd_index_queue_request(vhd_index_t *index, td_request_t treq) { vhd_index_block_t *block; vhd_index_request_t *req; req = vhd_index_allocate_request(index); if (!req) return -EBUSY; req->treq = treq; block = vhd_index_get_block(index, treq.sec / index->vhdi.spb); ASSERT(block && td_flag_test(block->state, VHD_INDEX_BLOCK_READ_PENDING)); list_add_tail(&req->next, &block->queue); return 0; } static void vhd_index_queue_read(td_driver_t *driver, td_request_t treq) { vhd_index_t *index; index = (vhd_index_t *)driver->data; while (treq.secs) { int err; td_request_t clone; err = 0; clone = treq; switch (vhd_index_read_cache(index, clone.sec)) { case -EINVAL: err = -EINVAL; goto fail; case VHD_INDEX_BAT_CLEAR: clone.secs = MIN(clone.secs, index->vhdi.spb - (clone.sec % index->vhdi.spb)); td_forward_request(clone); break; case VHD_INDEX_BIT_CLEAR: clone.secs = vhd_index_read_cache_span(index, clone.sec, clone.secs, 0); td_forward_request(clone); break; case VHD_INDEX_BIT_SET: clone.secs = vhd_index_read_cache_span(index, clone.sec, clone.secs, 1); err = vhd_index_schedule_data_read(index, clone); if (err) goto fail; break; case VHD_INDEX_CACHE_MISS: err = vhd_index_schedule_meta_read(index, clone.sec / index->vhdi.spb); if (err) goto fail; clone.secs = MIN(clone.secs, index->vhdi.spb - (clone.sec % index->vhdi.spb)); vhd_index_queue_request(index, clone); break; case VHD_INDEX_META_READ_PENDING: clone.secs = MIN(clone.secs, index->vhdi.spb - (clone.sec % index->vhdi.spb)); err = vhd_index_queue_request(index, clone); if (err) goto fail; break; } treq.sec += clone.secs; treq.secs -= 
clone.secs; treq.buf += vhd_sectors_to_bytes(clone.secs); continue; fail: clone.secs = treq.secs; td_complete_request(clone, err); break; } } static void vhd_index_queue_write(td_driver_t *driver, td_request_t treq) { td_complete_request(treq, -EPERM); } static inline void vhd_index_signal_completion(vhd_index_t *index, vhd_index_request_t *req, int err) { td_complete_request(req->treq, err); vhd_index_put_file_ref(req->file); vhd_index_free_request(index, req); } static void vhd_index_complete_meta_read(void *arg, struct tiocb *tiocb, int err) { int i; uint32_t blk; td_request_t treq; vhd_index_t *index; vhd_index_block_t *block; vhd_index_request_t *req, *r, *tmp; req = (vhd_index_request_t *)arg; index = req->index; blk = req->treq.sec / index->vhdi.spb; block = vhd_index_get_block(index, blk); ASSERT(block && td_flag_test(block->state, VHD_INDEX_BLOCK_READ_PENDING)); td_flag_clear(block->state, VHD_INDEX_BLOCK_READ_PENDING); if (err) { memset(block->vhdi_block.table, 0, block->table_size); vhd_index_block_for_each_request(block, r, tmp) vhd_index_signal_completion(index, r, err); return; } for (i = 0; i < block->vhdi_block.entries; i++) vhdi_entry_in(block->vhdi_block.table + i); td_flag_set(block->state, VHD_INDEX_BLOCK_VALID); vhd_index_block_for_each_request(block, r, tmp) { treq = r->treq; vhd_index_free_request(index, r); vhd_index_queue_read(index->driver, treq); } } static void vhd_index_complete_data_read(void *arg, struct tiocb *tiocb, int err) { vhd_index_t *index; vhd_index_request_t *req; req = (vhd_index_request_t *)arg; index = req->index; vhd_index_signal_completion(index, req, err); } static int vhd_index_get_parent_id(td_driver_t *driver, td_disk_id_t *id) { return -EINVAL; } static int vhd_index_validate_parent(td_driver_t *driver, td_driver_t *parent, td_flag_t flags) { return -EINVAL; } static void vhd_index_debug(td_driver_t *driver) { int i; vhd_index_t *index; index = (vhd_index_t *)driver->data; WARN("VHD INDEX %s\n", index->name); 
WARN("FILES:\n"); for (i = 0; i < index->files.entries; i++) { int j, fd, refcnt; fd = -1; refcnt = 0; for (j = 0; j < VHD_INDEX_FILE_POOL_SIZE; j++) if (index->fds[j].fid == index->files.table[i].file_id) { fd = index->fds[j].fd; refcnt = index->fds[j].refcnt; } WARN("%s %u %d %d\n", index->files.table[i].path, index->files.table[i].file_id, fd, refcnt); } WARN("REQUESTS:\n"); for (i = 0; i < VHD_INDEX_REQUESTS; i++) { vhd_index_request_t *req; req = index->requests_list + i; if (!req->index) continue; WARN("%d: buf: %p, sec: 0x%08"PRIx64", secs: 0x%04x, " "fid: %u, off: 0x%016"PRIx64"\n", i, req->treq.buf, req->treq.sec, req->treq.secs, req->file->fid, req->off); } WARN("BLOCKS:\n"); for (i = 0; i < VHD_INDEX_CACHE_SIZE; i++) { int queued; vhd_index_block_t *block; vhd_index_request_t *req, *tmp; queued = 0; block = index->cache[i]; if (!block) continue; vhd_index_block_for_each_request(block, req, tmp) ++queued; WARN("%d: blk: 0x%08"PRIx64", state: 0x%08x, queued: %d\n", i, block->blk, block->state, queued); } } struct tap_disk tapdisk_vhd_index = { .disk_type = "tapdisk_vhd_index", .flags = 0, .private_data_size = sizeof(vhd_index_t), .td_open = vhd_index_open, .td_close = vhd_index_close, .td_queue_read = vhd_index_queue_read, .td_queue_write = vhd_index_queue_write, .td_get_parent_id = vhd_index_get_parent_id, .td_validate_parent = vhd_index_validate_parent, .td_debug = vhd_index_debug, }; blktap-2.0.90/drivers/io-optimize.h0000644000000000000000000000200111664745551015660 0ustar rootroot/* Copyright (c) 2007, XenSource Inc. * All rights reserved. 
*/ #ifndef __IO_OPTIMIZE_H__ #define __IO_OPTIMIZE_H__ #include struct opio; struct opio_list { struct opio *head; struct opio *tail; }; struct opio { char *buf; unsigned long nbytes; long long offset; void *data; struct iocb *iocb; struct io_event event; struct opio *head; struct opio *next; struct opio_list list; }; struct opioctx { int num_opios; int free_opio_cnt; struct opio *opios; struct opio **free_opios; struct iocb **iocb_queue; struct io_event *event_queue; }; int opio_init(struct opioctx *ctx, int num_iocbs); void opio_free(struct opioctx *ctx); int io_merge(struct opioctx *ctx, struct iocb **queue, int num); int io_split(struct opioctx *ctx, struct io_event *events, int num); int io_expand_iocbs(struct opioctx *ctx, struct iocb **queue, int idx, int num); #endif blktap-2.0.90/drivers/block-ram.c0000644000000000000000000001637611664745551015301 0ustar rootroot/* * Copyright (c) 2007, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include #include #include #include "tapdisk.h" #include "tapdisk-driver.h" #include "tapdisk-interface.h" void *img; long int disksector_size; long int disksize; long int diskinfo; static int connections = 0; struct tdram_state { int fd; }; /*Get Image size, secsize*/ static int get_image_info(int fd, td_disk_info_t *info) { int ret; struct stat stat; ret = fstat(fd, &stat); if (ret != 0) { DPRINTF("ERROR: fstat failed, Couldn't stat image"); return -EINVAL; } if (S_ISBLK(stat.st_mode)) { /*Accessing block device directly*/ info->size = 0; if (ioctl(fd,BLKGETSIZE,&info->size)!=0) { DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image"); return -EINVAL; } DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost " "sector_shift [%llu]\n", (long long unsigned)(info->size << SECTOR_SHIFT), (long long unsigned)info->size); /*Get the sector size*/ #if defined(BLKSSZGET) { info->sector_size = DEFAULT_SECTOR_SIZE; ioctl(fd, BLKSSZGET, &info->sector_size); if (info->sector_size != DEFAULT_SECTOR_SIZE) DPRINTF("Note: sector size is %ld (not %d)\n", info->sector_size, DEFAULT_SECTOR_SIZE); } #else info->sector_size = DEFAULT_SECTOR_SIZE; #endif } else { /*Local file? 
try fstat instead*/ info->size = (stat.st_size >> SECTOR_SHIFT); info->sector_size = DEFAULT_SECTOR_SIZE; DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost " "sector_shift [%llu]\n", (long long unsigned)(info->size << SECTOR_SHIFT), (long long unsigned)info->size); } if (info->size == 0) { info->size =((uint64_t) MAX_RAMDISK_SIZE); info->sector_size = DEFAULT_SECTOR_SIZE; } info->info = 0; /*Store variables locally*/ disksector_size = info->sector_size; disksize = info->size; diskinfo = info->info; DPRINTF("Image sector_size: \n\t[%lu]\n", info->sector_size); return 0; } /* Open the disk file and initialize ram state. */ int tdram_open (td_driver_t *driver, const char *name, td_flag_t flags) { char *p; uint64_t size; int i, fd, ret = 0, count = 0, o_flags; struct tdram_state *prv = (struct tdram_state *)driver->data; connections++; if (connections > 1) { driver->info.sector_size = disksector_size; driver->info.size = disksize; driver->info.info = diskinfo; DPRINTF("Image already open, returning parameters:\n"); DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost " "sector_shift [%llu]\n", (long long unsigned)(driver->info.size << SECTOR_SHIFT), (long long unsigned)driver->info.size); DPRINTF("Image sector_size: \n\t[%lu]\n", driver->info.sector_size); prv->fd = -1; goto done; } /* Open the file */ o_flags = O_DIRECT | O_LARGEFILE | ((flags == TD_OPEN_RDONLY) ? O_RDONLY : O_RDWR); fd = open(name, o_flags); if ((fd == -1) && (errno == EINVAL)) { /* Maybe O_DIRECT isn't supported. */ o_flags &= ~O_DIRECT; fd = open(name, o_flags); if (fd != -1) DPRINTF("WARNING: Accessing image without" "O_DIRECT! 
(%s)\n", name); } else if (fd != -1) DPRINTF("open(%s) with O_DIRECT\n", name); if (fd == -1) { DPRINTF("Unable to open [%s]!\n",name); ret = 0 - errno; goto done; } prv->fd = fd; ret = get_image_info(fd, &driver->info); size = MAX_RAMDISK_SIZE; if (driver->info.size > size) { DPRINTF("Disk exceeds limit, must be less than [%d]MB", (MAX_RAMDISK_SIZE<>20); return -ENOMEM; } /*Read the image into memory*/ if (posix_memalign(&img, DEFAULT_SECTOR_SIZE, driver->info.size << SECTOR_SHIFT)) { DPRINTF("Mem malloc failed\n"); return -errno; } p = img; DPRINTF("Reading %llu bytes.......", (long long unsigned)driver->info.size << SECTOR_SHIFT); for (i = 0; i < driver->info.size; i++) { ret = read(prv->fd, p, driver->info.sector_size); if (ret != driver->info.sector_size) { DPRINTF("ret = %d, errno = %d\n", ret, errno); ret = 0 - errno; break; } else { count += ret; p = img + count; } } DPRINTF("[%d]\n",count); if (count != driver->info.size << SECTOR_SHIFT) { ret = -1; } else { ret = 0; } done: return ret; } void tdram_queue_read(td_driver_t *driver, td_request_t treq) { int size = treq.secs * driver->info.sector_size; uint64_t offset = treq.sec * (uint64_t)driver->info.sector_size; memcpy(treq.buf, img + offset, size); td_complete_request(treq, 0); } void tdram_queue_write(td_driver_t *driver, td_request_t treq) { int size = treq.secs * driver->info.sector_size; uint64_t offset = treq.sec * (uint64_t)driver->info.sector_size; /* We assume that write access is controlled * at a higher level for multiple disks */ memcpy(img + offset, treq.buf, size); td_complete_request(treq, 0); } int tdram_close(td_driver_t *driver) { connections--; return 0; } int tdram_get_parent_id(td_driver_t *driver, td_disk_id_t *id) { return TD_NO_PARENT; } int tdram_validate_parent(td_driver_t *driver, td_driver_t *pdriver, td_flag_t flags) { return -EINVAL; } struct tap_disk tapdisk_ram = { .disk_type = "tapdisk_ram", .flags = 0, .private_data_size = sizeof(struct tdram_state), .td_open = 
tdram_open, .td_close = tdram_close, .td_queue_read = tdram_queue_read, .td_queue_write = tdram_queue_write, .td_get_parent_id = tdram_get_parent_id, .td_validate_parent = tdram_validate_parent, .td_debug = NULL, }; blktap-2.0.90/drivers/block-llcache.c0000644000000000000000000003347011664745551016107 0ustar rootroot/* * Copyright (c) 2010, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include "tapdisk.h" #include "tapdisk-vbd.h" #include "tapdisk-driver.h" #include "tapdisk-interface.h" #include "tapdisk-disktype.h" #define DBG(_f, _a...) tlog_syslog(TLOG_DBG, _f, ##_a) #define INFO(_f, _a...) tlog_syslog(TLOG_INFO, _f, ##_a) #define WARN(_f, _a...) tlog_syslog(TLOG_WARN, "WARNING: "_f "in %s:%d", \ ##_a, __func__, __LINE__) #define BUG() td_panic() #define BUG_ON(_cond) if (unlikely(_cond)) { td_panic(); } #define WARN_ON(_p) if (unlikely(_cond)) { WARN(_cond); } int ll_write_error(int curr, int error) { if (error && (!curr || curr == -ENOSPC)) return error; return 0; } void ll_log_switch(int type, int error, td_image_t *local, td_image_t *shared) { WARN("WARNING: %s, on %s:%s. Switching to %s:%s.", strerror(-error), tapdisk_disk_types[local->type]->name, local->name, tapdisk_disk_types[shared->type]->name, shared->name); } /* * LLP: Local leaf persistent cache * -- Persistent write caching in local storage. * * VBD * \ * +--r/w--> llp+vhd:/local/leaf * \ * +--r/w--> vhd:/shared/leaf * \ * +--r/o--> vhd:/shared/parent * * We drive two 'leaf' (r/w) images: One LOCAL (i.e. on local storage, * unreliable and prone to out-of-space failures), and one SHARED * (i.e. in shared storage with plenty of physical backing). * * All images are on a linear read chain: LOCAL inherits from SHARED, * which inherits from a shared master image. This filter driver * aggregates LOCAL. SHARED is our immediate parent, forced into R/W * mode. * * Unless LOCAL failed, reads are issued to LOCAL, to save shared * storage bandwidth. In case of failure, SHARED provides continued * VDI consistency. * */ enum { LLP_MIRROR = 1, /* * LLP_MIRROR: * * Writes are mirrored to both LOCAL and SHARED. Reads are * issued to LOCAL. * * Failure to write LOCAL are recoverable. The driver will * transition to LLP_SHARED. * * Failure to write SHARED is irrecoverable, and signaled to * the original issuer. 
*/ LLP_SHARED = 2, /* * LLP_SHARED: * * Writes are issued to SHARED only. As are reads. * * Failure to write SHARED is irrecoverable. */ }; typedef struct llpcache td_llpcache_t; typedef struct llpcache_request td_llpcache_req_t; #define TD_LLPCACHE_MAX_REQ (MAX_REQUESTS*2) struct llpcache_vreq { enum { LOCAL = 0, SHARED = 1 } target; td_vbd_request_t vreq; }; struct llpcache_request { td_request_t treq; struct td_iovec iov; int error; struct llpcache_vreq lvr[2]; unsigned int pending; int mode; }; struct llpcache { td_image_t *local; int mode; td_llpcache_req_t reqv[TD_LLPCACHE_MAX_REQ]; td_llpcache_req_t *free[TD_LLPCACHE_MAX_REQ]; int n_free; }; static td_llpcache_req_t * llpcache_alloc_request(td_llpcache_t *s) { td_llpcache_req_t *req = NULL; if (likely(s->n_free)) req = s->free[--s->n_free]; return req; } static void llpcache_free_request(td_llpcache_t *s, td_llpcache_req_t *req) { BUG_ON(s->n_free >= TD_LLPCACHE_MAX_REQ); s->free[s->n_free++] = req; } static void __llpcache_write_cb(td_vbd_request_t *vreq, int error, void *token, int final) { td_llpcache_t *s = token; struct llpcache_vreq *lvr; td_llpcache_req_t *req; int mask; lvr = containerof(vreq, struct llpcache_vreq, vreq); req = containerof(lvr, td_llpcache_req_t, lvr[lvr->target]); mask = 1U << lvr->target; BUG_ON(!(req->pending & mask)) if (lvr->target == LOCAL && error == -ENOSPC) { td_image_t *shared = containerof(req->treq.image->next.next, td_image_t, next); ll_log_switch(DISK_TYPE_LLPCACHE, error, s->local, shared); s->mode = LLP_SHARED; error = 0; } req->pending &= ~mask; req->error = ll_write_error(req->error, error); if (!req->pending) { /* FIXME: Make sure this won't retry. */ td_complete_request(req->treq, req->error); llpcache_free_request(s, req); } } /* * NB. Write mirroring. Lacking per-image queues, it's still a * hack. But shall do for now: * * 1. Store the treq, thereby blocking the original vreq. * 2. Reissue, as two clone vreqs. One local, one shared. * 3. 
Clones seen again then get forwarded. * 4. Treq completes after both vreqs. * * We can recognize clones by matching the vreq->token field. */ static int llpcache_requeue_treq(td_llpcache_t *s, td_llpcache_req_t *req, int target) { struct llpcache_vreq *lvr; td_vbd_request_t *vreq; int err; lvr = &req->lvr[target]; lvr->target = target; vreq = &lvr->vreq; vreq->op = TD_OP_WRITE; vreq->sec = req->treq.sec; vreq->iov = &req->iov; vreq->iovcnt = 1; vreq->cb = __llpcache_write_cb; vreq->token = s; err = tapdisk_vbd_queue_request(req->treq.vreq->vbd, vreq); if (err) goto fail; req->pending |= 1UL << target; return 0; fail: req->error = req->error ? : err; return err; } static void llpcache_fork_write(td_llpcache_t *s, td_request_t treq) { td_llpcache_req_t *req; struct td_iovec *iov; int err; req = llpcache_alloc_request(s); if (!req) { td_complete_request(treq, -EBUSY); return; } memset(req, 0, sizeof(req)); req->treq = treq; iov = &req->iov; iov->base = treq.buf; iov->secs = treq.secs; err = llpcache_requeue_treq(s, req, LOCAL); if (err) goto fail; err = llpcache_requeue_treq(s, req, SHARED); if (err) goto fail; return; fail: if (!req->pending) { td_complete_request(treq, req->error); llpcache_free_request(s, req); } } static void llpcache_forward_write(td_llpcache_t *s, td_request_t treq) { const td_vbd_request_t *vreq = treq.vreq; struct llpcache_vreq *lvr; lvr = containerof(vreq, struct llpcache_vreq, vreq); switch (lvr->target) { case SHARED: td_forward_request(treq); break; case LOCAL: td_queue_write(s->local, treq); break; default: BUG(); } } static void llpcache_queue_write(td_driver_t *driver, td_request_t treq) { td_llpcache_t *s = driver->data; if (treq.vreq->token == s) llpcache_forward_write(s, treq); else llpcache_fork_write(s, treq); } static void llpcache_queue_read(td_driver_t *driver, td_request_t treq) { td_llpcache_t *s = driver->data; switch (s->mode) { case LLP_MIRROR: td_queue_read(s->local, treq); break; case LLP_SHARED: td_forward_request(treq); 
default: BUG(); } } static int llpcache_close(td_driver_t *driver) { td_llpcache_t *s = driver->data; if (s->local) { tapdisk_image_close(s->local); s->local = NULL; } return 0; } static int llpcache_open(td_driver_t *driver, const char *name, td_flag_t flags) { td_llpcache_t *s = driver->data; int i, err; s->mode = LLP_MIRROR; for (i = 0; i < TD_LLPCACHE_MAX_REQ; i++) llpcache_free_request(s, &s->reqv[i]); err = tapdisk_image_open(DISK_TYPE_VHD, name, flags, &s->local); if (err) goto fail; driver->info = s->local->driver->info; return 0; fail: llpcache_close(driver); return err; } static int llcache_get_parent_id(td_driver_t *driver, td_disk_id_t *id) { td_llpcache_t *s = driver->data; int err; err = td_get_parent_id(s->local, id); if (!err) id->flags &= ~TD_OPEN_RDONLY; return err; } static int llcache_validate_parent(td_driver_t *driver, td_driver_t *pdriver, td_flag_t flags) { return -ENOSYS; } struct tap_disk tapdisk_llpcache = { .disk_type = "tapdisk_llpcache", .flags = 0, .private_data_size = sizeof(td_llpcache_t), .td_open = llpcache_open, .td_close = llpcache_close, .td_queue_read = llpcache_queue_read, .td_queue_write = llpcache_queue_write, .td_get_parent_id = llcache_get_parent_id, .td_validate_parent = llcache_validate_parent, }; /* * LLE: Local Leaf Ephemeral Cache * -- Non-persistent write caching in local storage. * * VBD * \ * +--r/w--> lle+vhd:/shared/leaf * \ * +--r/w--> vhd:/local/leaf * \ * +--r/o--> vhd:/shared/parent * * Note that LOCAL and SHARED chain order differs from LLP. Shared * storage data masks local data. * * This means VDI state in shared storage state alone is * inconsistent. Wherever local is unavailable, SHARED must be * discarded too. */ enum { LLE_LOCAL = 1, /* * LLE_LOCAL: * * Writes are forwarded to LOCAL only. As are reads. This * reduces network overhead. * * Failure to write LOCAL is recoverable. The driver will * transition to LLE_SHARED. 
* * Failure to write to shared are irrecoverable and signaled * to the original issuer. */ LLE_SHARED = 2, /* * LLE_SHARED: * * Writes are issued to SHARED. As are reads. * * Failure to write to SHARED is irrecoverable. */ }; typedef struct llecache td_llecache_t; typedef struct llecache_request td_llecache_req_t; #define TD_LLECACHE_MAX_REQ (MAX_REQUESTS*2) struct llecache_request { td_llecache_t *s; td_request_t treq; int pending; int error; }; struct llecache { td_image_t *shared; int mode; td_llecache_req_t reqv[TD_LLECACHE_MAX_REQ]; td_llecache_req_t *free[TD_LLECACHE_MAX_REQ]; int n_free; }; static td_llecache_req_t * llecache_alloc_request(td_llecache_t *s) { td_llecache_req_t *req = NULL; if (likely(s->n_free)) req = s->free[--s->n_free]; return req; } static void llecache_free_request(td_llecache_t *s, td_llecache_req_t *req) { BUG_ON(s->n_free >= TD_LLECACHE_MAX_REQ); s->free[s->n_free++] = req; } static int llecache_close(td_driver_t *driver) { td_llecache_t *s = driver->data; if (s->shared) { tapdisk_image_close(s->shared); s->shared = NULL; } return 0; } static int llecache_open(td_driver_t *driver, const char *name, td_flag_t flags) { td_llecache_t *s = driver->data; int i, err; s->mode = LLE_LOCAL; for (i = 0; i < TD_LLECACHE_MAX_REQ; i++) llecache_free_request(s, &s->reqv[i]); err = tapdisk_image_open(DISK_TYPE_VHD, name, flags, &s->shared); if (err) goto fail; driver->info = s->shared->driver->info; return 0; fail: llecache_close(driver); return err; } static void __llecache_write_cb(td_request_t treq, int error) { td_llecache_req_t *req = treq.cb_data; td_llecache_t *s = req->s; BUG_ON(req->pending < treq.secs); req->pending -= treq.secs; req->error = ll_write_error(req->error, error); if (req->pending) return; if (req->error == -ENOSPC) { ll_log_switch(DISK_TYPE_LLECACHE, req->error, treq.image, s->shared); s->mode = LLE_SHARED; td_queue_write(s->shared, req->treq); } else td_complete_request(req->treq, error); llecache_free_request(s, req); } 
static void llecache_forward_write(td_llecache_t *s, td_request_t treq) { td_llecache_req_t *req; td_request_t clone; req = llecache_alloc_request(s); if (!req) { td_complete_request(treq, -EBUSY); return; } memset(req, 0, sizeof(req)); req->treq = treq; req->pending = treq.secs; req->s = s; clone = treq; clone.cb = __llecache_write_cb; clone.cb_data = req; td_forward_request(clone); } static void llecache_queue_write(td_driver_t *driver, td_request_t treq) { td_llecache_t *s = driver->data; switch (s->mode) { case LLE_LOCAL: llecache_forward_write(s, treq); break; case LLE_SHARED: td_queue_write(s->shared, treq); break; } } static void llecache_queue_read(td_driver_t *driver, td_request_t treq) { td_llecache_t *s = driver->data; switch (s->mode) { case LLE_LOCAL: td_forward_request(treq); break; case LLE_SHARED: td_queue_read(s->shared, treq); break; default: BUG(); } } struct tap_disk tapdisk_llecache = { .disk_type = "tapdisk_llecache", .flags = 0, .private_data_size = sizeof(td_llecache_t), .td_open = llecache_open, .td_close = llecache_close, .td_queue_read = llecache_queue_read, .td_queue_write = llecache_queue_write, .td_get_parent_id = llcache_get_parent_id, .td_validate_parent = llcache_validate_parent, }; blktap-2.0.90/drivers/tapdisk-stats.h0000644000000000000000000000462711664745551016226 0ustar rootroot/* * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. 
nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _TAPDISK_STATS_H_ #define _TAPDISK_STATS_H_ #include #define TD_STATS_MAX_DEPTH 8 struct tapdisk_stats_ctx { void *pos; void *buf; size_t size; int n_elem[TD_STATS_MAX_DEPTH]; int depth; }; typedef struct tapdisk_stats_ctx td_stats_t; static inline void tapdisk_stats_init(td_stats_t *st, char *buf, size_t size) { memset(st, 0, sizeof(*st)); st->pos = buf; st->buf = buf; st->size = size; } static inline size_t tapdisk_stats_length(td_stats_t *st) { return st->pos - st->buf; } void tapdisk_stats_enter(td_stats_t *st, char t); void tapdisk_stats_leave(td_stats_t *st, char t); void tapdisk_stats_field(td_stats_t *st, const char *key, const char *conv, ...); void tapdisk_stats_val(td_stats_t *st, const char *conv, ...); #endif /* _TAPDISK_STATS_H_ */ blktap-2.0.90/drivers/tapdisk-storage.h0000644000000000000000000000352011664745551016523 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _TAPDISK_STORAGE_H_ #define _TAPDISK_STORAGE_H_ #define TAPDISK_STORAGE_TYPE_NFS 1 #define TAPDISK_STORAGE_TYPE_EXT 2 #define TAPDISK_STORAGE_TYPE_LVM 3 int tapdisk_storage_type(const char *path); const char *tapdisk_storage_name(int type); #endif blktap-2.0.90/drivers/tapdisk-client.c0000644000000000000000000002567611664745551016350 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* client harness for tapdisk log */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include "log.h" #define BDPRINTF(_f, _a...) fprintf (stderr, "log: " _f "\n", ## _a) #define BWPRINTF(_f, _a...) 
fprintf (stderr, "log: " _f "\n", ## _a) struct writelog { char* shmpath; uint32_t shmsize; void* shm; /* next unprocessed item in the writelog */ void* cur; unsigned int inflight; /* pointer to start and end of free data space for requests */ void* dhd; void* dtl; log_sring_t* sring; log_front_ring_t fring; }; /* bytes free on the data ring */ static inline unsigned int dring_avail(struct writelog* wl) { /* one byte reserved to distinguish empty from full */ if (wl->dhd == wl->dtl) return sdataend(wl->shm) - sdatastart(wl->shm) - 1; if (wl->dhd < wl->dtl) return wl->dtl - wl->dhd - 1; return (sdataend(wl->shm) - wl->dhd) + (wl->dtl - sdatastart(wl->shm)) - 1; } /* advance ring pointer by len bytes */ static inline void* dring_advance(struct writelog* wl, void* start, size_t len) { void* next; int dsz = sdataend(wl->shm) - sdatastart(wl->shm); next = start + (len % dsz); if (next > sdataend(wl->shm)) next -= dsz; return next; } static void usage(void) { fprintf(stderr, "usage: tapdisk-client \n"); } /* returns socket file descriptor */ static int tdctl_open(const char* sockpath) { struct sockaddr_un saddr; int fd; if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { BWPRINTF("error creating socket: %s", strerror(errno)); return -1; } memset(&saddr, 0, sizeof(saddr)); saddr.sun_family = AF_UNIX; memcpy(saddr.sun_path, sockpath, strlen(sockpath)); if (connect(fd, &saddr, sizeof(saddr)) < 0) { BWPRINTF("error connecting to socket %s: %s", sockpath, strerror(errno)); close(fd); return -1; } return fd; } static int ctl_talk(int fd, struct log_ctlmsg* msg, char* rsp, int rsplen) { int rc; if ((rc = write(fd, msg, sizeof(*msg))) < 0) { BWPRINTF("error sending ctl request: %s", strerror(errno)); return -1; } else if (rc < sizeof(*msg)) { BWPRINTF("short ctl write (%d/%zd bytes)", rc, sizeof(*msg)); return -1; } if (!rsplen) return 0; if ((rc = read(fd, rsp, rsplen)) < 0) { BWPRINTF("error reading ctl response: %s", strerror(errno)); return -1; } else if (rc < rsplen) { 
BWPRINTF("short ctl read (%d/%d bytes)", rc, rsplen); return -1; } return 0; } static int ctl_get_shmem(int fd, struct writelog* wl) { struct log_ctlmsg req; char rsp[CTLRSPLEN_SHMP + 1]; int rc; memset(&req, 0, sizeof(req)); memset(rsp, 0, sizeof(rsp)); memcpy(req.msg, LOGCMD_SHMP, 4); if ((rc = ctl_talk(fd, &req, rsp, CTLRSPLEN_SHMP)) < 0) { BWPRINTF("error getting shared memory parameters"); return -1; } memcpy(&wl->shmsize, rsp, sizeof(wl->shmsize)); wl->shmpath = strdup(rsp + sizeof(wl->shmsize)); BDPRINTF("shared memory parameters: size: %u, path: %s", wl->shmsize, wl->shmpath); return 0; } static void ctlmsg_init(struct log_ctlmsg* msg, const char* cmd) { memset(msg, 0, sizeof(*msg)); memcpy(msg->msg, cmd, 4); } static int ctl_get_writes(int fd) { struct log_ctlmsg req; char rsp[CTLRSPLEN_GET]; int rc; ctlmsg_init(&req, LOGCMD_GET); if ((rc = ctl_talk(fd, &req, rsp, CTLRSPLEN_GET)) < 0) { BWPRINTF("error getting writes"); return -1; } return 0; } static int ctl_peek_writes(int fd) { struct log_ctlmsg req; char rsp[CTLRSPLEN_PEEK]; int rc; ctlmsg_init(&req, LOGCMD_PEEK); if ((rc = ctl_talk(fd, &req, rsp, CTLRSPLEN_PEEK)) < 0) { BWPRINTF("error peeking writes"); return -1; } return 0; } /* submit pending requests */ static int ctl_kick(int fd) { struct log_ctlmsg req; int rc; ctlmsg_init(&req, LOGCMD_KICK); if ((rc = ctl_talk(fd, &req, NULL, 0)) < 0) { BWPRINTF("error kicking ring"); return -1; } return 0; } static int ctl_clear_writes(int fd) { struct log_ctlmsg req; char rsp[CTLRSPLEN_CLEAR]; int rc; ctlmsg_init(&req, LOGCMD_CLEAR); if ((rc = ctl_talk(fd, &req, rsp, CTLRSPLEN_CLEAR)) < 0) { BWPRINTF("error clearing writes"); return -1; } return 0; } static int writelog_map(struct writelog* wl) { int fd; void* shm; if ((fd = shm_open(wl->shmpath, O_RDWR, 0750)) < 0) { BWPRINTF("could not open shared memory at %s: %s", wl->shmpath, strerror(errno)); return -1; } wl->shm = mmap(NULL, wl->shmsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); close(fd); if (wl->shm 
== MAP_FAILED) { BWPRINTF("could not mmap write log shm: %s", strerror(errno)); return -1; } wl->cur = wl->shm; wl->inflight = 0; wl->dhd = wl->dtl = sdatastart(wl->shm); BDPRINTF("shm cookie: 0x%x, data size: %u", *((uint32_t*)wl->shm), dring_avail(wl)); wl->sring = sringstart(wl->shm); /* need some thought about what to do on reconnect */ FRONT_RING_INIT(&wl->fring, wl->sring, SRINGSIZE); return 0; } static int writelog_dump(struct writelog* wl) { struct disk_range* range = wl->shm; for (range = wl->shm; (void*)range < bmend(wl->shm); range++) { if (!range->count) break; BDPRINTF("dirty extent: %"PRIu64":%u", range->sector, range->count); } return 0; } /* walk dirty map and enqueue read requests. * returns: 0 when entire bitmap has been enqueued, * 1 when the ring is full * -1 on error */ static int writelog_enqueue_requests(struct writelog* wl) { struct disk_range* range = wl->shm; log_request_t* req; for (range = wl->cur; (void*)range < bmend(wl->shm); range++) { if (!range->count) break; if (RING_FULL(&wl->fring)) break; /* insert range into request stream */ /* 1. get next request slot from ring */ /* 2. ensure enough shm space is available */ BDPRINTF("enqueueing dirty extent: %"PRIu64":%u (ring space: %d/%d)", range->sector, range->count, RING_FREE_REQUESTS(&wl->fring), RING_SIZE(&wl->fring)); req = RING_GET_REQUEST(&wl->fring, wl->fring.req_prod_pvt); req->sector = range->sector; req->count = range->count; /* ... 
*/ req->offset = 0; wl->fring.req_prod_pvt++; wl->inflight++; } wl->cur = range; if (range->count) return 1; return 0; } static int writelog_dequeue_responses(struct writelog* wl) { RING_IDX rstart, rend; log_response_t rsp; rstart = wl->fring.rsp_cons; rend = wl->sring->rsp_prod; BDPRINTF("ring kicked (start = %u, end = %u)", rstart, rend); while (rstart != rend) { memcpy(&rsp, RING_GET_RESPONSE(&wl->fring, rstart), sizeof(rsp)); BDPRINTF("ctl: read response %"PRIu64":%u", rsp.sector, rsp.count); wl->fring.rsp_cons = ++rstart; wl->inflight--; } return 0; } static int writelog_free(struct writelog* wl) { if (wl->shmpath) { free(wl->shmpath); wl->shmpath = NULL; } if (wl->shm) { munmap(wl->shm, wl->shmsize); wl->shm = NULL; } return 0; } int get_writes(struct writelog* wl, int fd, int peek) { int rc; if (peek) rc = ctl_peek_writes(fd); else rc = ctl_get_writes(fd); if (rc < 0) return rc; wl->cur = wl->shm; return 0; } int await_responses(struct writelog* wl, int fd) { struct log_ctlmsg msg; int rc; /* sit on socket waiting for kick */ if ((rc = read(fd, &msg, sizeof(msg))) < 0) { BWPRINTF("error reading from control socket: %s", strerror(errno)); return -1; } else if (!rc) { BWPRINTF("EOF on control socket"); return -1; } else if (rc < sizeof(msg)) { BWPRINTF("short reply (%d/%d bytes)", rc, sizeof(msg)); return -1; } if (strncmp(msg.msg, LOGCMD_KICK, 4)) { BWPRINTF("Unknown message received: %.4s", msg.msg); return -1; } if (writelog_dequeue_responses(wl) < 0) return -1; return 0; } /* read_loop: * 1. extract dirty bitmap * 2. feed as much as possible onto ring * 3. kick * 4. as responses come back, feed more of the dirty bitmap * into the ring * 5. when entire bitmap has been queued, go to 1? 
*/ int read_loop(struct writelog* wl, int fd) { int rc; if (get_writes(wl, fd, 1) < 0) return -1; writelog_dump(wl); do { rc = writelog_enqueue_requests(wl); if (RING_FREE_REQUESTS(&wl->fring) < RING_SIZE(&wl->fring)) RING_PUSH_REQUESTS(&wl->fring); if (ctl_kick(fd) < 0) return -1; /* collect responses */ if (wl->inflight && await_responses(wl, fd) < 0) return -1; } while (rc > 0); return rc; } int main(int argc, char* argv[]) { int fd; struct writelog wl; char cmd; if (argc < 2) { usage(); return 1; } if (argc < 3) cmd = 'p'; else cmd = argv[2][0]; fd = tdctl_open(argv[1]); if (ctl_get_shmem(fd, &wl) < 0) return 1; if (writelog_map(&wl) < 0) { BWPRINTF("Error mapping write log: %s", strerror(errno)); return 1; } switch (cmd) { case 'p': if (get_writes(&wl, fd, 1) < 0) return 1; writelog_dump(&wl); break; case 'c': if (ctl_clear_writes(fd) < 0) return 1; break; case 'g': if (get_writes(&wl, fd, 0) < 0) return 1; writelog_dump(&wl); break; case 'r': if (read_loop(&wl, fd) < 0) return 1; break; default: usage(); return 1; } writelog_free(&wl); close(fd); return 0; } blktap-2.0.90/drivers/tapdisk-storage.c0000644000000000000000000000562511664745551016526 0ustar rootroot/* * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include "tapdisk-storage.h" #ifndef NFS_SUPER_MAGIC #define NFS_SUPER_MAGIC 0x6969 #endif static int __tapdisk_fs_storage_type(const char *rpath) { struct statfs fst; int type, err; err = statfs(rpath, &fst); if (err) return -errno; switch (fst.f_type) { case NFS_SUPER_MAGIC: type = TAPDISK_STORAGE_TYPE_NFS; break; default: type = TAPDISK_STORAGE_TYPE_EXT; break; } return type; } static int __tapdisk_blk_storage_type(const char *rpath) { return TAPDISK_STORAGE_TYPE_LVM; } int tapdisk_storage_type(const char *path) { char rpath[PATH_MAX], *p; struct stat st; int err, rv; p = realpath(path, rpath); if (!p) return -errno; err = stat(rpath, &st); if (err) return -errno; switch (st.st_mode & S_IFMT) { case S_IFBLK: rv = __tapdisk_blk_storage_type(rpath); break; case S_IFREG: rv = __tapdisk_fs_storage_type(rpath); break; default: rv = -EINVAL; break; } return rv; } const char * tapdisk_storage_name(int type) { switch (type) { case TAPDISK_STORAGE_TYPE_NFS: return "nfs"; case TAPDISK_STORAGE_TYPE_EXT: return "ext"; case TAPDISK_STORAGE_TYPE_LVM: return "lvm"; case -1: return "n/a"; default: return ""; } } 
blktap-2.0.90/drivers/block-valve.c0000644000000000000000000003427511664745551015635 0ustar rootroot/* * Copyright (c) 2010, Citrix Systems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include "tapdisk.h" #include "tapdisk-driver.h" #include "tapdisk-server.h" #include "tapdisk-interface.h" #include "block-valve.h" typedef struct td_valve td_valve_t; typedef struct td_valve_request td_valve_request_t; struct td_valve_request { td_request_t treq; int secs; struct list_head entry; td_valve_t *valve; }; struct td_valve_stats { unsigned long long stor; unsigned long long forw; }; struct td_valve { char *brname; unsigned long flags; int sock; event_id_t sock_id; event_id_t sched_id; event_id_t retry_id; unsigned int cred; unsigned int need; unsigned int done; struct list_head stor; struct list_head forw; td_valve_request_t reqv[MAX_REQUESTS]; td_valve_request_t *free[MAX_REQUESTS]; int n_free; struct td_valve_stats stats; }; #define td_valve_for_each_stored_request(_req, _next, _valve) \ list_for_each_entry_safe(_req, _next, &(_valve)->stor, entry) #define td_valve_for_each_forwarded_request(_req, _next, _valve) \ list_for_each_entry_safe(_req, _next, &(_valve)->forw, entry) #define TD_VALVE_CONNECT_INTERVAL 2 /* s */ #define TD_VALVE_RDLIMIT (1<<0) #define TD_VALVE_WRLIMIT (1<<1) #define TD_VALVE_KILLED (1<<31) static void valve_schedule_retry(td_valve_t *); static void valve_conn_receive(td_valve_t *); static void valve_conn_request(td_valve_t *, unsigned long); static void valve_forward_stored_requests(td_valve_t *); static void valve_kill(td_valve_t *); #define DBG(_f, _a...) if (1) { tlog_syslog(TLOG_DBG, _f, ##_a); } #define INFO(_f, _a...) tlog_syslog(TLOG_INFO, "valve: " _f, ##_a) #define WARN(_f, _a...) tlog_syslog(TLOG_WARN, "WARNING: "_f " in %s:%d", \ ##_a, __func__, __LINE__) #define ERR(_f, _a...) tlog_syslog(TLOG_WARN, "ERROR: " _f " in %s:%d", \ ##_a, __func__, __LINE__) #define VERR(_err, _f, _a...) 
tlog_syslog(TLOG_WARN, \ "ERROR: err=%d (%s), " _f ".", \ _err, strerror(-(_err)), ##_a) #undef PERROR #define PERROR(_f, _a...) VERR(-errno, _f, ##_a) #define BUG() do { \ ERR("Aborting"); \ td_panic(); \ } while (0) #define BUG_ON(_cond) \ if (unlikely(_cond)) { \ ERR("(%s) = %ld", #_cond, (long)(_cond)); \ BUG(); \ } #define WARN_ON(_cond) ({ \ int __cond = _cond; \ if (unlikely(__cond)) \ WARN("(%s) = %ld", #_cond, (long)(_cond)); \ __cond; \ }) #define ARRAY_SIZE(_a) (sizeof(_a)/sizeof((_a)[0])) #define TREQ_SIZE(_treq) ((unsigned int)(_treq.secs) << 9) static td_valve_request_t * valve_alloc_request(td_valve_t *valve) { td_valve_request_t *req = NULL; if (valve->n_free) req = valve->free[--valve->n_free]; return req; } static void valve_free_request(td_valve_t *valve, td_valve_request_t *req) { BUG_ON(valve->n_free >= ARRAY_SIZE(valve->free)); list_del_init(&req->entry); valve->free[valve->n_free++] = req; } static void __valve_sock_event(event_id_t id, char mode, void *private) { td_valve_t *valve = private; valve_conn_receive(valve); valve_forward_stored_requests(valve); } static void valve_set_done_pending(td_valve_t *valve) { WARN_ON(valve->done == 0); tapdisk_server_mask_event(valve->sched_id, 0); } static void valve_clear_done_pending(td_valve_t *valve) { WARN_ON(valve->done != 0); tapdisk_server_mask_event(valve->sched_id, 1); } static void __valve_sched_event(event_id_t id, char mode, void *private) { td_valve_t *valve = private; if (likely(valve->done > 0)) /* flush valve->done */ valve_conn_request(valve, 0); } static void valve_sock_close(td_valve_t *valve) { if (valve->sock >= 0) { close(valve->sock); valve->sock = -1; } if (valve->sock_id >= 0) { tapdisk_server_unregister_event(valve->sock_id); valve->sock_id = -1; } if (valve->sched_id >= 0) { tapdisk_server_unregister_event(valve->sched_id); valve->sched_id = -1; } } static int valve_sock_open(td_valve_t *valve) { struct sockaddr_un addr = { .sun_family = AF_UNIX }; int s, id, err; s = 
socket(AF_UNIX, SOCK_STREAM, 0); if (s < 0) { PERROR("socket"); err = -errno; goto fail; } valve->sock = s; if (valve->brname[0] == '/') strncpy(addr.sun_path, valve->brname, sizeof(addr.sun_path)); else snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s", TD_VALVE_SOCKDIR, valve->brname); err = connect(valve->sock, &addr, sizeof(addr)); if (err) { err = -errno; goto fail; } id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, valve->sock, 0, __valve_sock_event, valve); if (id < 0) { err = id; goto fail; } valve->sock_id = id; id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT, -1, 0, __valve_sched_event, valve); if (id < 0) { err = id; goto fail; } valve->sched_id = id; INFO("Connected to %s", addr.sun_path); valve->cred = 0; valve->need = 0; valve->done = 0; valve_clear_done_pending(valve); return 0; fail: valve_sock_close(valve); return err; } static int valve_sock_send(td_valve_t *valve, const void *msg, size_t size) { ssize_t n; n = send(valve->sock, msg, size, MSG_DONTWAIT); if (n < 0) return -errno; if (n != size) return -EPROTO; return 0; } static int valve_sock_recv(td_valve_t *valve, void *msg, size_t size) { ssize_t n; n = recv(valve->sock, msg, size, MSG_DONTWAIT); if (n < 0) return -errno; return n; } static void __valve_retry_timeout(event_id_t id, char mode, void *private) { td_valve_t *valve = private; int err; err = valve_sock_open(valve); if (!err) tapdisk_server_unregister_event(valve->retry_id); } static void valve_schedule_retry(td_valve_t *valve) { int id; BUG_ON(valve->sock_id >= 0); id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT, -1, TD_VALVE_CONNECT_INTERVAL, __valve_retry_timeout, valve); BUG_ON(id < 0); valve->retry_id = id; } static void valve_conn_open(td_valve_t *valve) { int err; BUG_ON(valve->flags & TD_VALVE_KILLED); err = valve_sock_open(valve); if (err) { WARN("%s: %s", valve->brname, strerror(-err)); valve_schedule_retry(valve); } } static void valve_conn_close(td_valve_t *valve, int reset) { 
td_valve_request_t *req, *next; valve_sock_close(valve); if (reset) td_valve_for_each_stored_request(req, next, valve) { td_forward_request(req->treq); valve->stats.forw++; valve_free_request(valve, req); } WARN_ON(!list_empty(&valve->stor)); } static void valve_conn_reset(td_valve_t *valve) { valve_conn_close(valve, 1); valve_conn_open(valve); } void valve_conn_receive(td_valve_t *valve) { unsigned long buf[32], cred = 0; ssize_t n; int i, err; n = valve_sock_recv(valve, buf, sizeof(buf)); if (!n) { err = -ECONNRESET; goto reset; } if (n < 0) { err = n; if (err != -EAGAIN) goto reset; } for (i = 0; i < n / sizeof(buf[0]); i++) { err = WARN_ON(buf[i] >= TD_RLB_REQUEST_MAX); if (err) goto kill; cred += buf[i]; } if (cred > valve->need) { err = -EINVAL; goto reset; } valve->cred += cred; valve->need -= cred; return; reset: VERR(err, "resetting connection"); valve_conn_reset(valve); return; kill: ERR("Killing valve."); valve_kill(valve); } static void valve_conn_request(td_valve_t *valve, unsigned long size) { struct td_valve_req _req; int err; _req.need = size; _req.done = valve->done; valve->need += size; valve->done = 0; valve_clear_done_pending(valve); err = valve_sock_send(valve, &_req, sizeof(_req)); if (!err) return; VERR(err, "resetting connection"); valve_conn_reset(valve); } static int valve_expend_request(td_valve_t *valve, const td_request_t treq) { if (valve->flags & TD_VALVE_KILLED) return 0; if (valve->sock < 0) return 0; if (valve->cred < TREQ_SIZE(treq)) return -EAGAIN; valve->cred -= TREQ_SIZE(treq); return 0; } static void __valve_complete_treq(td_request_t treq, int error) { td_valve_request_t *req = treq.cb_data; td_valve_t *valve = req->valve; BUG_ON(req->secs < treq.secs); req->secs -= treq.secs; valve->done += TREQ_SIZE(treq); valve_set_done_pending(valve); if (!req->secs) { td_complete_request(req->treq, error); valve_free_request(valve, req); } } static void valve_forward_stored_requests(td_valve_t *valve) { td_valve_request_t *req, *next; 
td_request_t clone; int err; td_valve_for_each_stored_request(req, next, valve) { err = valve_expend_request(valve, req->treq); if (err) break; clone = req->treq; clone.cb = __valve_complete_treq; clone.cb_data = req; td_forward_request(clone); valve->stats.forw++; list_move(&req->entry, &valve->forw); } } static int valve_store_request(td_valve_t *valve, td_request_t treq) { td_valve_request_t *req; req = valve_alloc_request(valve); if (!req) return -EBUSY; valve_conn_request(valve, TREQ_SIZE(treq)); req->treq = treq; req->secs = treq.secs; list_add_tail(&req->entry, &valve->stor); valve->stats.stor++; return 0; } static void valve_kill(td_valve_t *valve) { valve->flags |= TD_VALVE_KILLED; valve_conn_close(valve, 1); } static void valve_init(td_valve_t *valve, unsigned long flags) { int i; memset(valve, 0, sizeof(*valve)); INIT_LIST_HEAD(&valve->stor); INIT_LIST_HEAD(&valve->forw); valve->sock = -1; valve->sock_id = -1; valve->retry_id = -1; valve->sched_id = -1; valve->flags = flags; for (i = ARRAY_SIZE(valve->reqv) - 1; i >= 0; i--) { td_valve_request_t *req = &valve->reqv[i]; req->valve = valve; INIT_LIST_HEAD(&req->entry); valve_free_request(valve, req); } } static int td_valve_close(td_driver_t *driver) { td_valve_t *valve = driver->data; WARN_ON(!list_empty(&valve->stor)); WARN_ON(!list_empty(&valve->forw)); valve_conn_close(valve, 0); if (valve->brname) { free(valve->brname); valve->brname = NULL; } return 0; } static int td_valve_open(td_driver_t *driver, const char *name, td_flag_t flags) { td_valve_t *valve = driver->data; int err; valve_init(valve, TD_VALVE_WRLIMIT); valve->brname = strdup(name); if (!valve->brname) { err = -errno; goto fail; } valve_conn_open(valve); return 0; fail: td_valve_close(driver); return err; } static void td_valve_queue_request(td_driver_t *driver, td_request_t treq) { td_valve_t *valve = driver->data; int err; switch (treq.op) { case TD_OP_READ: if (valve->flags & TD_VALVE_RDLIMIT) break; goto forward; case TD_OP_WRITE: if 
(valve->flags & TD_VALVE_WRLIMIT) break; goto forward; default: BUG(); } err = valve_expend_request(valve, treq); if (!err) goto forward; err = valve_store_request(valve, treq); if (err) td_complete_request(treq, -EBUSY); return; forward: td_forward_request(treq); valve->stats.forw++; } static int td_valve_get_parent_id(td_driver_t *driver, td_disk_id_t *id) { return -EINVAL; } static int td_valve_validate_parent(td_driver_t *driver, td_driver_t *parent_driver, td_flag_t flags) { return -EINVAL; } static void td_valve_stats(td_driver_t *driver, td_stats_t *st) { td_valve_t *valve = driver->data; td_valve_request_t *req, *next; int n_reqs; tapdisk_stats_field(st, "bridge", "d", valve->brname); tapdisk_stats_field(st, "flags", "#x", valve->flags); tapdisk_stats_field(st, "cred", "d", valve->cred); tapdisk_stats_field(st, "need", "d", valve->need); tapdisk_stats_field(st, "done", "d", valve->done); /* * stored is [ waiting, total-waits ] */ n_reqs = 0; td_valve_for_each_stored_request(req, next, valve) n_reqs++; tapdisk_stats_field(st, "stor", "["); tapdisk_stats_val(st, "d", n_reqs); tapdisk_stats_val(st, "llu", valve->stats.stor); tapdisk_stats_leave(st, ']'); /* * forwarded is [ in-flight, total-requests ] */ n_reqs = 0; td_valve_for_each_forwarded_request(req, next, valve) n_reqs++; tapdisk_stats_field(st, "forw", "["); tapdisk_stats_val(st, "d", n_reqs); tapdisk_stats_val(st, "llu", valve->stats.forw); tapdisk_stats_leave(st, ']'); } struct tap_disk tapdisk_valve = { .disk_type = "tapdisk_valve", .flags = 0, .private_data_size = sizeof(td_valve_t), .td_open = td_valve_open, .td_close = td_valve_close, .td_queue_read = td_valve_queue_request, .td_queue_write = td_valve_queue_request, .td_get_parent_id = td_valve_get_parent_id, .td_validate_parent = td_valve_validate_parent, .td_stats = td_valve_stats, }; blktap-2.0.90/drivers/tapdisk-utils.c0000644000000000000000000001374711664745551016226 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. 
* All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef __linux__ #include #endif #define SYSLOG_NAMES #include #include "tapdisk.h" #include "tapdisk-log.h" #include "tapdisk-utils.h" #include "tapdisk-syslog.h" #define MIN(a,b) (((a) < (b)) ? 
(a) : (b)) static int tapdisk_syslog_facility_by_name(const char *name) { int facility; CODE *c; facility = -1; for (c = facilitynames; c->c_name != NULL; ++c) if (!strcmp(c->c_name, name)) { facility = c->c_val; break; } return facility; } int tapdisk_syslog_facility(const char *arg) { int facility; char *endptr; if (arg) { facility = strtol(arg, &endptr, 0); if (*endptr == 0) return facility; facility = tapdisk_syslog_facility_by_name(arg); if (facility >= 0) return facility; } return LOG_DAEMON; } char* tapdisk_syslog_ident(const char *name) { char ident[TD_SYSLOG_IDENT_MAX+1]; size_t size, len; pid_t pid; pid = getpid(); size = sizeof(ident); len = 0; len = snprintf(NULL, 0, "[%d]", pid); len = snprintf(ident, size - len, "%s", name); len += snprintf(ident + len, size - len, "[%d]", pid); return strdup(ident); } size_t tapdisk_syslog_strftime(char *buf, size_t size, const struct timeval *tv) { const char *mon[] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" }; struct tm tm; /* * TIMESTAMP := " "
" " ":" ":" . * Local time, no locales. */ localtime_r(&tv->tv_sec, &tm); return snprintf(buf, size, "%s %2d %02d:%02d:%02d", mon[tm.tm_mon], tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec); } size_t tapdisk_syslog_strftv(char *buf, size_t size, const struct timeval *tv) { struct tm tm; localtime_r(&tv->tv_sec, &tm); return snprintf(buf, size, "[%02d:%02d:%02d.%03ld]", tm.tm_hour, tm.tm_min, tm.tm_sec, (long)tv->tv_usec / 1000); } int tapdisk_set_resource_limits(void) { int err; struct rlimit rlim; rlim.rlim_cur = RLIM_INFINITY; rlim.rlim_max = RLIM_INFINITY; err = setrlimit(RLIMIT_MEMLOCK, &rlim); if (err == -1) { EPRINTF("RLIMIT_MEMLOCK failed: %d\n", errno); return -errno; } err = mlockall(MCL_CURRENT | MCL_FUTURE); if (err == -1) { EPRINTF("mlockall failed: %d\n", errno); return -errno; } #define CORE_DUMP #if defined(CORE_DUMP) err = setrlimit(RLIMIT_CORE, &rlim); if (err == -1) EPRINTF("RLIMIT_CORE failed: %d\n", errno); #endif return 0; } int tapdisk_namedup(char **dup, const char *name) { *dup = NULL; if (strnlen(name, MAX_NAME_LEN) >= MAX_NAME_LEN) return -ENAMETOOLONG; *dup = strdup(name); if (!*dup) return -ENOMEM; return 0; } /*Get Image size, secsize*/ int tapdisk_get_image_size(int fd, uint64_t *_sectors, uint32_t *_sector_size) { struct stat stat; uint64_t sectors, bytes; uint32_t sector_size; sectors = 0; sector_size = 0; *_sectors = 0; *_sector_size = 0; if (fstat(fd, &stat)) { DPRINTF("ERROR: fstat failed, Couldn't stat image"); return -EINVAL; } if (S_ISBLK(stat.st_mode)) { /*Accessing block device directly*/ if (ioctl(fd,BLKGETSIZE64,&bytes)==0) { sectors = bytes >> SECTOR_SHIFT; } else if (ioctl(fd,BLKGETSIZE,§ors)!=0) { DPRINTF("ERR: BLKGETSIZE and BLKGETSIZE64 failed, couldn't stat image"); return -EINVAL; } /*Get the sector size*/ #if defined(BLKSSZGET) { sector_size = DEFAULT_SECTOR_SIZE; ioctl(fd, BLKSSZGET, §or_size); if (sector_size != DEFAULT_SECTOR_SIZE) DPRINTF("Note: sector size is %u (not %d)\n", sector_size, DEFAULT_SECTOR_SIZE); } 
#else sector_size = DEFAULT_SECTOR_SIZE; #endif } else { /*Local file? try fstat instead*/ sectors = (stat.st_size >> SECTOR_SHIFT); sector_size = DEFAULT_SECTOR_SIZE; } if (sectors == 0) { sectors = 16836057ULL; sector_size = DEFAULT_SECTOR_SIZE; } return 0; } #ifdef __linux__ int tapdisk_linux_version(void) { struct utsname uts; unsigned int version, patchlevel, sublevel; int n, err; err = uname(&uts); if (err) return -errno; n = sscanf(uts.release, "%u.%u.%u", &version, &patchlevel, &sublevel); if (n != 3) return -ENOSYS; return KERNEL_VERSION(version, patchlevel, sublevel); } #else int tapdisk_linux_version(void) { return -ENOSYS; } #endif blktap-2.0.90/drivers/Makefile.am0000644000000000000000000000515711664745551015315 0ustar rootroot AM_CFLAGS = -Wall AM_CFLAGS += -Werror AM_CPPFLAGS = -D_GNU_SOURCE AM_CPPFLAGS += -I$(top_srcdir)/include libexec_PROGRAMS = tapdisk tapdisk_SOURCES = tapdisk2.c tapdisk_LDADD = libtapdisk.la noinst_PROGRAMS = tapdisk-stream tapdisk_stream_LDADD = libtapdisk.la sbin_PROGRAMS = td-util sbin_PROGRAMS += td-rated td_util_SOURCES = td.c td_util_LDADD = libtapdisk.la noinst_LTLIBRARIES = libtapdisk.la libtapdisk_la_SOURCES = tapdisk.h libtapdisk_la_SOURCES += scheduler.c libtapdisk_la_SOURCES += scheduler.h libtapdisk_la_SOURCES += tapdisk-control.c libtapdisk_la_SOURCES += tapdisk-control.h libtapdisk_la_SOURCES += tapdisk-vbd.c libtapdisk_la_SOURCES += tapdisk-vbd.h libtapdisk_la_SOURCES += linux-blktap.h libtapdisk_la_SOURCES += tapdisk-blktap.c libtapdisk_la_SOURCES += tapdisk-blktap.h libtapdisk_la_SOURCES += tapdisk-image.c libtapdisk_la_SOURCES += tapdisk-image.h libtapdisk_la_SOURCES += tapdisk-driver.c libtapdisk_la_SOURCES += tapdisk-driver.h libtapdisk_la_SOURCES += tapdisk-disktype.c libtapdisk_la_SOURCES += tapdisk-disktype.h libtapdisk_la_SOURCES += tapdisk-interface.c libtapdisk_la_SOURCES += tapdisk-interface.h libtapdisk_la_SOURCES += tapdisk-server.c libtapdisk_la_SOURCES += tapdisk-server.h libtapdisk_la_SOURCES += 
tapdisk-queue.c libtapdisk_la_SOURCES += tapdisk-queue.h libtapdisk_la_SOURCES += libaio-compat.h libtapdisk_la_SOURCES += tapdisk-filter.c libtapdisk_la_SOURCES += tapdisk-filter.h libtapdisk_la_SOURCES += tapdisk-logfile.c libtapdisk_la_SOURCES += tapdisk-logfile.h libtapdisk_la_SOURCES += tapdisk-log.c libtapdisk_la_SOURCES += tapdisk-log.h libtapdisk_la_SOURCES += tapdisk-utils.c libtapdisk_la_SOURCES += tapdisk-utils.h libtapdisk_la_SOURCES += tapdisk-syslog.c libtapdisk_la_SOURCES += tapdisk-syslog.h libtapdisk_la_SOURCES += tapdisk-stats.c libtapdisk_la_SOURCES += tapdisk-stats.h libtapdisk_la_SOURCES += tapdisk-storage.c libtapdisk_la_SOURCES += tapdisk-storage.h libtapdisk_la_SOURCES += tapdisk-loglimit.c libtapdisk_la_SOURCES += tapdisk-loglimit.h libtapdisk_la_SOURCES += io-optimize.c libtapdisk_la_SOURCES += io-optimize.h libtapdisk_la_SOURCES += lock.c libtapdisk_la_SOURCES += lock.h libtapdisk_la_SOURCES += atomicio.c libtapdisk_la_SOURCES += atomicio.h libtapdisk_la_SOURCES += block-aio.c libtapdisk_la_SOURCES += block-ram.c libtapdisk_la_SOURCES += block-cache.c libtapdisk_la_SOURCES += block-vhd.c libtapdisk_la_SOURCES += block-valve.c libtapdisk_la_SOURCES += block-valve.h libtapdisk_la_SOURCES += block-vindex.c libtapdisk_la_SOURCES += block-lcache.c libtapdisk_la_SOURCES += block-llcache.c libtapdisk_la_LIBADD = ../vhd/lib/libvhd.la libtapdisk_la_LIBADD += -laio blktap-2.0.90/drivers/tapdisk-ring.c0000644000000000000000000002152511664745551016016 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include "tapdisk-ring.h" static int tapdisk_uring_create_ctlfd(td_uring_t *ring) { int fd, err; struct sockaddr_un saddr; if (strnlen(ring->ctlfd_path, sizeof(saddr.sun_family)) >= sizeof(saddr.sun_family)) return -ENAMETOOLONG; fd = socket(AF_UNIX, SOCK_STREAM, 0); if (fd == -1) return -errno; memset(&saddr, 0, sizeof(struct sockaddr_un)); saddr.sun_family = AF_UNIX; memcpy(saddr.sun_path, ring->ctlfd_path, strlen(ring->ctlfd_path)); err = unlink(ring->ctlfd_path); if (err == -1 && errno != ENOENT) { err = -errno; goto fail; } err = bind(fd, &saddr, sizeof(struct sockaddr_un)); if (err == -1) { err = -errno; goto fail; } err = listen(fd, 1); if (err == -1) { err = -errno; goto fail; } ring->ctlfd = fd; return 0; fail: close(fd); return err; } static void tapdisk_uring_destroy_ctlfd(td_uring_t *ring) { if (ring->ctlfd) { close(ring->ctlfd); ring->ctlfd = 0; } if (ring->ctlfd_path) { unlink(ring->ctlfd_path); free(ring->ctlfd_path); ring->ctlfd_path = NULL; } } static int tapdisk_uring_connect_ctlfd(td_uring_t *ring) { int fd, err; struct sockaddr_un saddr; if (strnlen(ring->ctlfd_path, sizeof(saddr.sun_path)) >= sizeof(saddr.sun_path)) return -ENAMETOOLONG; fd = socket(AF_UNIX, SOCK_STREAM, 0); if (fd == -1) return -errno; memset(&saddr, 0, sizeof(struct sockaddr_un)); saddr.sun_family = AF_UNIX; memcpy(saddr.sun_path, ring->ctlfd_path, strlen(ring->ctlfd_path)); err = connect(fd, &saddr, sizeof(saddr)); if (err == -1) { err = -errno; goto fail; } ring->ctlfd = fd; return 0; fail: close(fd); return err; } static void tapdisk_uring_disconnect_ctlfd(td_uring_t *ring) { if (ring->ctlfd) close(ring->ctlfd); free(ring->ctlfd_path); ring->ctlfd_path = NULL; } static int tapdisk_uring_create_shmem(td_uring_t *ring) { int fd, err; fd = shm_open(ring->shmem_path, O_CREAT | O_RDWR, 0750); if (fd == -1) return -errno; err = ftruncate(fd, ring->shmem_size); if (err == -1) { err = -errno; goto out; } ring->shmem = 
mmap(NULL, ring->shmem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if (ring->shmem == MAP_FAILED) { ring->shmem = NULL; err = -errno; goto out; } err = 0; out: close(fd); return err; } static void tapdisk_uring_destroy_shmem(td_uring_t *ring) { if (ring->shmem) { munmap(ring->shmem, ring->shmem_size); ring->shmem = NULL; } if (ring->shmem_path) { shm_unlink(ring->shmem_path); free(ring->shmem_path); ring->shmem_path = NULL; } } static int tapdisk_uring_connect_shmem(td_uring_t *ring) { int fd, err; td_uring_header_t header, *p; fd = shm_open(ring->shmem_path, O_RDWR); if (fd == -1) return -errno; p = mmap(NULL, sizeof(td_uring_header_t), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if (p == MAP_FAILED) { err = -errno; goto out; } memcpy(&header, p, sizeof(td_uring_header_t)); munmap(p, sizeof(td_uring_header_t)); if (memcmp(header.cookie, TAPDISK_URING_COOKIE, sizeof(header.cookie))) { err = -EINVAL; goto out; } if (header.version != TD_URING_CURRENT_VERSION) { err = -EINVAL; goto out; } ring->ring_size = header.ring_size; ring->data_size = header.data_size; ring->shmem_size = header.shmem_size; ring->shmem = mmap(NULL, ring->shmem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if (ring->shmem == MAP_FAILED) { rint->shmem = NULL; err = -errno; goto out; } err = 0; out: close(fd); return err; } static void tapdisk_uring_disconnect_shmem(td_uring_t *ring) { if (ring->shmem) munmap(ring->shmem, ring->shmem_size); free(ring->shmem_path); ring->shmem_path = NULL; } int tapdisk_uring_create(td_uring_t *ring, const char *location, uint32_t ring_size, uint32_t data_size) { int fd, err; memset(ring, 0, sizeof(td_uring_t)); ring->ring_size = ring_size; ring->data_size = data_size; ring->shmem_size = ring_size + data_size + sizeof(td_uring_header_t); err = asprintf(&ring->shmem_path, "%s.shm", location); if (err == -1) { ring->shmem_path = NULL; err = -errno; goto fail; } err = asprintf(&ring->ctlfd_path, "%s.cfd", location); if (err == -1) { ring->ctlfd_path = NULL; 
err = -errno; goto fail; } err = tapdisk_uring_create_ctlfd(ring); if (err) goto fail; err = tapdisk_uring_create_shmem(ring); if (err) goto fail; ring->ring_area = (unsigned long)ring->shmem + sizeof(td_uring_header_t); ring->data_area = (unsigned long)ring->ring_area + ring->ring_size; return 0; fail: tapdisk_uring_destroy(ring); return err; } int tapdisk_uring_destroy(td_uring_t *ring) { tapdisk_uring_destroy_shmem(ring); tapdisk_uring_destroy_ctlfd(ring); return 0; } int tapdisk_uring_connect(td_uring_t *ring, const char *location) { int fd, err; memset(ring, 0, sizeof(td_uring_t)); err = asprintf(&ring->shmem_path, "%s.shm", location); if (err == -1) { ring->shmem_path = NULL; err = -errno; goto fail; } err = asprintf(&ring->ctlfd_path, "%s.cfd", location); if (err == -1) { ring->ctlfd_path = NULL; err = -errno; goto fail; } err = tapdisk_uring_connect_ctlfd(ring); if (err) goto fail; err = tapdisk_uring_connect_shmem(ring); if (err) goto fail; err = 0; fail: } int tapdisk_uring_disconnect(td_uring_t *ring) { tapdisk_uring_disconnect_shmem(ring); tapdisk_uring_disconnect_ctlfd(ring); return 0; } static int tapdisk_ring_read_message(int fd, td_uring_message_t *message, int timeout) { fd_set readfds; int ret, len, offset; struct timeval tv, *t; t = NULL; offset = 0; len = sizeof(td_uring_message_t); if (timeout) { tv.tv_sec = timeout; tv.tv_usec = 0; t = &tv; } while (offset < len) { FD_ZERO(&readfds); FD_SET(fd, &readfds); /* we don't bother reinitializing tv. at worst, it will wait a * bit more time than expected. 
*/ ret = select(fd + 1, &readfds, NULL, NULL, t); if (ret == -1) break; else if (FD_ISSET(fd, &readfds)) { ret = read(fd, message + offset, len - offset); if (ret <= 0) break; offset += ret; } else break; } if (offset != len) return -EIO; return 0; } static int tapdisk_ring_write_message(int fd, td_uring_message_t *message, int timeout) { fd_set writefds; int ret, len, offset; struct timeval tv, *t; t = NULL; offset = 0; len = sizeof(td_uring_message_t); if (timeout) { tv.tv_sec = timeout; tv.tv_usec = 0; t = &tv; } while (offset < len) { FD_ZERO(&writefds); FD_SET(fd, &writefds); /* we don't bother reinitializing tv. at worst, it will wait a * bit more time than expected. */ ret = select(fd + 1, NULL, &writefds, NULL, t); if (ret == -1) break; else if (FD_ISSET(fd, &writefds)) { ret = write(fd, message + offset, len - offset); if (ret <= 0) break; offset += ret; } else break; } if (offset != len) return -EIO; return 0; } int tapdisk_uring_poll(td_uring_t *ring) { int err; td_uring_message_t message; err = tapdisk_uring_read_message(ring->ctlfd, &message, 1); if (err) return err; if (message.type != TAPDISK_URING_MESSAGE_KICK) return -EINVAL; return 0; } int tapdisk_uring_kick(td_uring_t *ring) { td_uring_message_t message; memset(&message, 0, sizeof(td_uring_message_t)); message.type = TAPDISK_URING_MESSAGE_KICK; return tapdisk_uring_write_message(ring->ctlfd, &message, 1); } blktap-2.0.90/drivers/block-aio.c0000644000000000000000000001720111664745551015256 0ustar rootroot/* * Copyright (c) 2007, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include #include #include "tapdisk.h" #include "tapdisk-driver.h" #include "tapdisk-interface.h" #define MAX_AIO_REQS TAPDISK_DATA_REQUESTS struct tdaio_state; struct aio_request { td_request_t treq; struct tiocb tiocb; struct tdaio_state *state; }; struct tdaio_state { int fd; td_driver_t *driver; int aio_free_count; struct aio_request aio_requests[MAX_AIO_REQS]; struct aio_request *aio_free_list[MAX_AIO_REQS]; }; /*Get Image size, secsize*/ static int tdaio_get_image_info(int fd, td_disk_info_t *info) { int ret; unsigned long long bytes; struct stat stat; ret = fstat(fd, &stat); if (ret != 0) { DPRINTF("ERROR: fstat failed, Couldn't stat image"); return -EINVAL; } if (S_ISBLK(stat.st_mode)) { /*Accessing block device directly*/ info->size = 0; if (ioctl(fd,BLKGETSIZE64,&bytes)==0) { info->size = bytes >> SECTOR_SHIFT; } else if (ioctl(fd,BLKGETSIZE,&info->size)!=0) { DPRINTF("ERR: BLKGETSIZE and BLKGETSIZE64 failed, couldn't stat image"); return -EINVAL; } DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost " "sector_shift [%llu]\n", (long long unsigned)(info->size << SECTOR_SHIFT), (long long unsigned)info->size); /*Get the sector size*/ #if defined(BLKSSZGET) { info->sector_size = DEFAULT_SECTOR_SIZE; ioctl(fd, BLKSSZGET, &info->sector_size); if (info->sector_size != DEFAULT_SECTOR_SIZE) DPRINTF("Note: sector size is %ld (not %d)\n", info->sector_size, DEFAULT_SECTOR_SIZE); } #else info->sector_size = DEFAULT_SECTOR_SIZE; #endif } else { /*Local file? 
try fstat instead*/ info->size = (stat.st_size >> SECTOR_SHIFT); info->sector_size = DEFAULT_SECTOR_SIZE; DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost " "sector_shift [%llu]\n", (long long unsigned)(info->size << SECTOR_SHIFT), (long long unsigned)info->size); } if (info->size == 0) { info->size =((uint64_t) 16836057); info->sector_size = DEFAULT_SECTOR_SIZE; } info->info = 0; return 0; } /* Open the disk file and initialize aio state. */ int tdaio_open(td_driver_t *driver, const char *name, td_flag_t flags) { int i, fd, ret, o_flags; struct tdaio_state *prv; ret = 0; prv = (struct tdaio_state *)driver->data; DPRINTF("block-aio open('%s')", name); memset(prv, 0, sizeof(struct tdaio_state)); prv->aio_free_count = MAX_AIO_REQS; for (i = 0; i < MAX_AIO_REQS; i++) prv->aio_free_list[i] = &prv->aio_requests[i]; /* Open the file */ o_flags = O_DIRECT | O_LARGEFILE | ((flags & TD_OPEN_RDONLY) ? O_RDONLY : O_RDWR); fd = open(name, o_flags); if ( (fd == -1) && (errno == EINVAL) ) { /* Maybe O_DIRECT isn't supported. */ o_flags &= ~O_DIRECT; fd = open(name, o_flags); if (fd != -1) DPRINTF("WARNING: Accessing image without" "O_DIRECT! 
(%s)\n", name); } else if (fd != -1) DPRINTF("open(%s) with O_DIRECT\n", name); if (fd == -1) { DPRINTF("Unable to open [%s] (%d)!\n", name, 0 - errno); ret = 0 - errno; goto done; } ret = tdaio_get_image_info(fd, &driver->info); if (ret) { close(fd); goto done; } prv->fd = fd; done: return ret; } void tdaio_complete(void *arg, struct tiocb *tiocb, int err) { struct aio_request *aio = (struct aio_request *)arg; struct tdaio_state *prv = aio->state; td_complete_request(aio->treq, err); prv->aio_free_list[prv->aio_free_count++] = aio; } void tdaio_queue_read(td_driver_t *driver, td_request_t treq) { int size; uint64_t offset; struct aio_request *aio; struct tdaio_state *prv; prv = (struct tdaio_state *)driver->data; size = treq.secs * driver->info.sector_size; offset = treq.sec * (uint64_t)driver->info.sector_size; if (prv->aio_free_count == 0) goto fail; aio = prv->aio_free_list[--prv->aio_free_count]; aio->treq = treq; aio->state = prv; td_prep_read(&aio->tiocb, prv->fd, treq.buf, size, offset, tdaio_complete, aio); td_queue_tiocb(driver, &aio->tiocb); return; fail: td_complete_request(treq, -EBUSY); } void tdaio_queue_write(td_driver_t *driver, td_request_t treq) { int size; uint64_t offset; struct aio_request *aio; struct tdaio_state *prv; prv = (struct tdaio_state *)driver->data; size = treq.secs * driver->info.sector_size; offset = treq.sec * (uint64_t)driver->info.sector_size; if (prv->aio_free_count == 0) goto fail; aio = prv->aio_free_list[--prv->aio_free_count]; aio->treq = treq; aio->state = prv; td_prep_write(&aio->tiocb, prv->fd, treq.buf, size, offset, tdaio_complete, aio); td_queue_tiocb(driver, &aio->tiocb); return; fail: td_complete_request(treq, -EBUSY); } int tdaio_close(td_driver_t *driver) { struct tdaio_state *prv = (struct tdaio_state *)driver->data; close(prv->fd); return 0; } int tdaio_get_parent_id(td_driver_t *driver, td_disk_id_t *id) { return TD_NO_PARENT; } int tdaio_validate_parent(td_driver_t *driver, td_driver_t *pdriver, td_flag_t 
flags) { return -EINVAL; } void tdaio_stats(td_driver_t *driver, td_stats_t *st) { struct tdaio_state *prv = (struct tdaio_state *)driver->data; int n_pending; n_pending = MAX_AIO_REQS - prv->aio_free_count; tapdisk_stats_field(st, "reqs", "{"); tapdisk_stats_field(st, "max", "lu", MAX_AIO_REQS); tapdisk_stats_field(st, "pending", "d", n_pending); tapdisk_stats_leave(st, '}'); } struct tap_disk tapdisk_aio = { .disk_type = "tapdisk_aio", .flags = 0, .private_data_size = sizeof(struct tdaio_state), .td_open = tdaio_open, .td_close = tdaio_close, .td_queue_read = tdaio_queue_read, .td_queue_write = tdaio_queue_write, .td_get_parent_id = tdaio_get_parent_id, .td_validate_parent = tdaio_validate_parent, .td_debug = NULL, .td_stats = tdaio_stats, }; blktap-2.0.90/drivers/io-optimize.c0000644000000000000000000003527011664745551015671 0ustar rootroot/* * Copyright (c) 2007, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include "io-optimize.h" #include "tapdisk-log.h" #if (!defined(TEST) && defined(DEBUG)) #define DBG(ctx, f, a...) tlog_write(TLOG_DBG, f, ##a) #elif defined(TEST) #define DBG(ctx, f, a...) printf(f, ##a) #else #define DBG(ctx, f, a...) ((void)0) #endif void opio_free(struct opioctx *ctx) { free(ctx->opios); ctx->opios = NULL; free(ctx->free_opios); ctx->free_opios = NULL; free(ctx->iocb_queue); ctx->iocb_queue = NULL; free(ctx->event_queue); ctx->event_queue = NULL; } int opio_init(struct opioctx *ctx, int num_iocbs) { int i; memset(ctx, 0, sizeof(struct opioctx)); ctx->num_opios = num_iocbs; ctx->free_opio_cnt = num_iocbs; ctx->opios = calloc(1, sizeof(struct opio) * num_iocbs); ctx->free_opios = calloc(1, sizeof(struct opio *) * num_iocbs); ctx->iocb_queue = calloc(1, sizeof(struct iocb *) * num_iocbs); ctx->event_queue = calloc(1, sizeof(struct io_event) * num_iocbs); if (!ctx->opios || !ctx->free_opios || !ctx->iocb_queue || !ctx->event_queue) goto fail; for (i = 0; i < num_iocbs; i++) ctx->free_opios[i] = &ctx->opios[i]; return 0; fail: opio_free(ctx); return -ENOMEM; } static inline struct opio * alloc_opio(struct opioctx *ctx) { if (ctx->free_opio_cnt <= 0) return NULL; return ctx->free_opios[--ctx->free_opio_cnt]; } static inline void free_opio(struct opioctx *ctx, struct opio *op) { memset(op, 0, sizeof(struct opio)); 
ctx->free_opios[ctx->free_opio_cnt++] = op; } static inline void restore_iocb(struct opio *op) { struct iocb *io = op->iocb; io->data = op->data; io->u.c.buf = op->buf; io->u.c.nbytes = op->nbytes; } static inline int iocb_optimized(struct opioctx *ctx, struct iocb *io) { unsigned long iop = (unsigned long)io->data; unsigned long start = (unsigned long)ctx->opios; unsigned long end = start + (ctx->num_opios * sizeof(struct opio)); return (iop >= start && iop < end); } static inline int contiguous_sectors(struct iocb *l, struct iocb *r) { return (l->u.c.offset + l->u.c.nbytes == r->u.c.offset); } static inline int contiguous_buffers(struct iocb *l, struct iocb *r) { return (l->u.c.buf + l->u.c.nbytes == r->u.c.buf); } static inline int contiguous_iocbs(struct iocb *l, struct iocb *r) { return ((l->aio_fildes == r->aio_fildes) && contiguous_sectors(l, r) && contiguous_buffers(l, r)); } static inline void init_opio_list(struct opio *op) { op->list.head = op->list.tail = op; } static struct opio * opio_iocb_init(struct opioctx *ctx, struct iocb *io) { struct opio *op; op = alloc_opio(ctx); if (!op) return NULL; op->buf = io->u.c.buf; op->nbytes = io->u.c.nbytes; op->offset = io->u.c.offset; op->data = io->data; op->iocb = io; io->data = op; init_opio_list(op); return op; } static inline struct opio * opio_get(struct opioctx *ctx, struct iocb *io) { if (iocb_optimized(ctx, io)) return (struct opio *)io->data; else return opio_iocb_init(ctx, io); } static int merge_tail(struct opioctx *ctx, struct iocb *head, struct iocb *io) { struct opio *ophead, *opio; ophead = opio_get(ctx, head); if (!ophead) return -ENOMEM; opio = opio_get(ctx, io); if (!opio) return -ENOMEM; opio->head = ophead; head->u.c.nbytes += io->u.c.nbytes; ophead->list.tail = ophead->list.tail->next = opio; return 0; } static int merge(struct opioctx *ctx, struct iocb *head, struct iocb *io) { if (head->aio_lio_opcode != io->aio_lio_opcode) return -EINVAL; if (!contiguous_iocbs(head, io)) return -EINVAL; 
return merge_tail(ctx, head, io); } #if (defined(TEST) || defined(DEBUG)) static void print_optimized_iocbs(struct opioctx *ctx, struct opio *op, int *cnt) { char pref[10]; while (op) { snprintf(pref, 10, " %d: ", (*cnt)++); __print_iocb(ctx, op->iocb, pref); op = op->next; } } static void print_merged_iocbs(struct opioctx *ctx, struct iocb **iocbs, int num_iocbs) { int i, cnt; char pref[10]; struct iocb *io; struct opio *op; DBG(ctx, "merged iocbs:\n"); for (i = 0, cnt = 0; i < num_iocbs; i++) { io = iocbs[i]; snprintf(pref, 10, "%d: ", cnt++); __print_iocb(ctx, io, pref); if (iocb_optimized(ctx, io)) { op = (struct opio *)io->data; print_optimized_iocbs(ctx, op->next, &cnt); } } } #else #define print_optimized_iocbs(...) #define print_merged_iocbs(...) #endif int io_merge(struct opioctx *ctx, struct iocb **queue, int num) { int i, on_queue; struct iocb *io, **q; if (!num) return 0; on_queue = 0; q = ctx->iocb_queue; memcpy(q, queue, num * sizeof(struct iocb *)); for (i = 1; i < num; i++) { io = q[i]; if (merge(ctx, queue[on_queue], io) != 0) queue[++on_queue] = io; } print_merged_iocbs(ctx, queue, on_queue + 1); return ++on_queue; } static int expand_iocb(struct opioctx *ctx, struct iocb **queue, struct iocb *io) { int idx; struct opio *op, *next; idx = 0; op = (struct opio *)io->data; while (op) { next = op->next; restore_iocb(op); queue[idx++] = op->iocb; free_opio(ctx, op); op = next; } return idx; } int io_expand_iocbs(struct opioctx *ctx, struct iocb **queue, int idx, int num) { int i, on_queue; struct iocb *io, **q; if (!num) return 0; on_queue = 0; q = ctx->iocb_queue; memcpy(q, queue, num * sizeof(struct iocb *)); for (i = idx; i < num; i++) { io = q[i]; if (!iocb_optimized(ctx, io)) queue[on_queue++] = io; else on_queue += expand_iocb(ctx, queue + on_queue, io); } return on_queue; } static int expand_event(struct opioctx *ctx, struct io_event *event, struct io_event *queue, int idx) { int err; struct iocb *io; struct io_event *ep; struct opio *ophead, 
*op, *next; io = event->obj; ophead = (struct opio *)io->data; op = ophead; if (event->res == io->u.c.nbytes) err = 0; else if ((int)event->res < 0) err = (int)event->res; else err = -EIO; while (op) { next = op->next; ep = &queue[idx++]; ep->obj = op->iocb; ep->res = (err ? err : op->nbytes); restore_iocb(op); free_opio(ctx, op); op = next; } return idx; } int io_split(struct opioctx *ctx, struct io_event *events, int num) { int on_queue; struct iocb *io; struct io_event *ep, *q; if (!num) return 0; on_queue = 0; q = ctx->event_queue; memcpy(q, events, num * sizeof(struct io_event)); for (ep = q; num-- > 0; ep++) { io = ep->obj; if (!iocb_optimized(ctx, io)) events[on_queue++] = *ep; else on_queue = expand_event(ctx, ep, events, on_queue); } return on_queue; } /****************************************************************************** debug print functions ******************************************************************************/ static inline void __print_iocb(struct opioctx *ctx, struct iocb *io, char *prefix) { DBG(ctx, "%soff: %08llx, nbytes: %04lx, buf: %p, type: %s, data: %08lx," " optimized: %d\n", prefix, io->u.c.offset, io->u.c.nbytes, io->u.c.buf, (io->aio_lio_opcode == IO_CMD_PREAD ? "read" : "write"), (unsigned long)io->data, iocb_optimized(ctx, io)); } #define print_iocb(ctx, io) __print_iocb(ctx, io, "") /****************************************************************************** end debug print functions ******************************************************************************/ #if defined(TEST) #define hmask 0x80000000UL #define smask 0x40000000UL #define make_data(idx, is_head, sparse) \ (void *)((idx) | ((is_head) ? hmask : 0) | ((sparse) ? smask : 0)) #define data_idx(data) (int)((unsigned long)(data) & (0x0fffffff)) #define data_is_head(data) (((unsigned long)(data) & hmask) ? 1 : 0) #define data_is_sparse(data) (((unsigned long)(data) & smask) ? 
1 : 0) static void usage(void) { fprintf(stderr, "usage: io_optimize [-n num_runs] " "[-i num_iocbs] [-s num_secs] [-r random_seed]\n"); exit(-1); } static int xalloc_cnt, xfree_cnt; static inline char * xalloc(int size) { char *buf = malloc(size); if (!buf) { fprintf(stderr, "xalloc failed\n"); exit(ENOMEM); } xalloc_cnt++; return buf; } static inline void xfree(void *buf) { free(buf); xfree_cnt++; } static void randomize_iocbs(struct iocb **iocbs, int num_iocbs, int num_secs) { int i, j; i = 0; while (i < num_iocbs) { char *buf; short type; int segs, sparse_mem; uint64_t offset, nbytes; type = (random() % 10 < 5 ? IO_CMD_PREAD : IO_CMD_PWRITE); offset = ((random() % num_secs) << 9); if (random() % 10 < 4) { segs = 1; nbytes = (((random() % 7) + 1) << 9); } else { segs = (random() % 10) + 1; nbytes = 4096; } if (i + segs > num_iocbs) segs = (num_iocbs - i); sparse_mem = (random() % 10 < 2 ? 1 : 0); if (sparse_mem) buf = xalloc(nbytes); else buf = xalloc(segs * nbytes); for (j = 0; j < segs; j++) { struct iocb *io = iocbs[i + j]; io->aio_lio_opcode = type; io->u.c.nbytes = nbytes; io->u.c.offset = offset; io->u.c.buf = buf; offset += nbytes; io->data = make_data(i + j, (j == 0), sparse_mem); if (j + 1 < segs && sparse_mem) buf = xalloc(nbytes); else buf += nbytes; } i += segs; } } static int simulate_io(struct iocb **iocbs, struct io_event *events, int num_iocbs) { int i, done; struct iocb *io; struct io_event *ep; if (num_iocbs > 1) done = (random() % (num_iocbs - 1)) + 1; else done = num_iocbs; for (i = 0; i < done; i++) { io = iocbs[i]; ep = &events[i]; ep->obj = io; ep->res = (random() % 10 < 8 ? io->u.c.nbytes : 0); } return done; } static inline void process_events(struct opioctx *ctx, struct iocb *iocb_list, struct io_event *events, int num) { int i; struct iocb *io; for (i = 0; i < num; i++) { io = events[i].obj; print_iocb(ctx, io); if (data_idx(io->data) != (io - iocb_list)) { printf("corrupt data! 
data_idx = %d, io = %d\n", data_idx(io->data), (io - iocb_list)); exit(-1); } if (data_is_head(io->data) || data_is_sparse(io->data)) xfree(io->u.c.buf); memset(io, 0, sizeof(struct iocb)); } } static inline void init_optest(struct iocb *iocb_list, struct iocb **iocbs, struct io_event *events, int num) { int i; memset(iocb_list, 0, num * sizeof(struct iocb)); memset(events, 0, num * sizeof(struct io_event)); for (i = 0; i < num; i++) iocbs[i] = &iocb_list[i]; } static void print_iocbs(struct opioctx *ctx, struct iocb **iocbs, int num_iocbs) { int i; char pref[10]; struct iocb *io; DBG(ctx, "iocbs:\n"); for (i = 0; i < num_iocbs; i++) { io = iocbs[i]; snprintf(pref, 10, "%d: ", i); __print_iocb(ctx, io, pref); } } static void print_events(struct opioctx *ctx, struct io_event *events, int num_events) { int i; struct iocb *io; for (i = 0; i < num_events; i++) { io = events[i].obj; print_iocb(ctx, io); } } int main(int argc, char **argv) { uint64_t num_secs; struct opioctx ctx; struct io_event *events; int i, c, num_runs, num_iocbs, seed; struct iocb *iocb_list, **iocbs, **ioqueue; num_runs = 1; num_iocbs = 300; seed = time(NULL); num_secs = ((4ULL << 20) >> 9); /* 4GB disk */ while ((c = getopt(argc, argv, "n:i:s:r:h")) != -1) { switch (c) { case 'n': num_runs = atoi(optarg); break; case 'i': num_iocbs = atoi(optarg); break; case 's': num_secs = strtoull(optarg, NULL, 10); break; case 'r': seed = atoi(optarg); break; case 'h': usage(); case '?': fprintf(stderr, "Unrecognized option: -%c\n", optopt); usage(); } } printf("Running %d tests with %d iocbs on %llu sectors, seed = %d\n", num_runs, num_iocbs, num_secs, seed); srand(seed); iocb_list = malloc(num_iocbs * sizeof(struct iocb)); iocbs = malloc(num_iocbs * sizeof(struct iocb *)); events = malloc(num_iocbs * sizeof(struct io_event)); if (!iocb_list || !iocbs || !events || opio_init(&ctx, num_iocbs)) { fprintf(stderr, "initialization failed\n"); exit(ENOMEM); } for (i = 0; i < num_runs; i++) { int op_rem, op_done, 
num_split, num_events, num_done; ioqueue = iocbs; init_optest(iocb_list, ioqueue, events, num_iocbs); randomize_iocbs(ioqueue, num_iocbs, num_secs); print_iocbs(&ctx, ioqueue, num_iocbs); op_done = 0; num_done = 0; op_rem = io_merge(&ctx, ioqueue, num_iocbs); print_iocbs(&ctx, ioqueue, op_rem); print_merged_iocbs(&ctx, ioqueue, op_rem); while (num_done < num_iocbs) { DBG(&ctx, "optimized remaining: %d\n", op_rem); DBG(&ctx, "simulating\n"); num_events = simulate_io(ioqueue + op_done, events, op_rem); print_events(&ctx, events, num_events); DBG(&ctx, "splitting %d\n", num_events); num_split = io_split(&ctx, events, num_events); print_events(&ctx, events, num_split); DBG(&ctx, "processing %d\n", num_split); process_events(&ctx, iocb_list, events, num_split); op_rem -= num_events; op_done += num_events; num_done += num_split; } DBG(&ctx, "run %d: processed: %d, xallocs: %d, xfrees: %d\n", i, num_done, xalloc_cnt, xfree_cnt); if (xalloc_cnt != xfree_cnt) exit(-1); xalloc_cnt = xfree_cnt = 0; } free(iocbs); free(events); free(iocb_list); opio_free(&ctx); return 0; } #endif blktap-2.0.90/drivers/tapdisk-logfile.c0000644000000000000000000001263711664745551016504 0ustar rootroot/* * Copyright (c) 2009, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include #include "tapdisk-logfile.h" #include "tapdisk-utils.h" #define MIN(a,b) (((a) < (b)) ? (a) : (b)) static inline size_t page_align(size_t size) { size_t page_size = sysconf(_SC_PAGE_SIZE); return (size + page_size - 1) & ~(page_size - 1); } static void tapdisk_logfile_free_buffer(td_logfile_t *log) { if (log->vbuf) { munmap(log->vbuf, page_align(log->vbufsz)); log->vbuf = NULL; } } static int tapdisk_logfile_init_buffer(td_logfile_t *log, size_t size) { int prot, flags, err; if (!size) return -EINVAL; prot = PROT_READ|PROT_WRITE; flags = MAP_ANONYMOUS|MAP_PRIVATE; log->vbuf = mmap(NULL, page_align(size), prot, flags, -1, 0); if (log->vbuf == MAP_FAILED) { log->vbuf = NULL; goto fail; } err = mlock(log->vbuf, page_align(size)); if (err) goto fail; log->vbufsz = size; return 0; fail: tapdisk_logfile_free_buffer(log); err = -errno; return err; } int tapdisk_logfile_unlink(td_logfile_t *log) { int err; err = unlink(log->path); if (err) err = -errno; return err; } static int __tapdisk_logfile_rename(td_logfile_t *log, const char *newpath) { const size_t max = sizeof(log->path); int 
err; if (!strcmp(log->path, newpath)) return 0; if (strlen(newpath) > max) return -ENAMETOOLONG; err = rename(log->path, newpath); if (err) { err = -errno; return err; } strncpy(log->path, newpath, max); return 0; } static int tapdisk_logfile_name(char *path, size_t size, const char *dir, const char *ident, const char *suffix) { const size_t max = MIN(size, TD_LOGFILE_PATH_MAX); return snprintf(path, max, "%s/%s.%d%s", dir, ident, getpid(), suffix); } int tapdisk_logfile_rename(td_logfile_t *log, const char *dir, const char *ident, const char *suffix) { char newpath[TD_LOGFILE_PATH_MAX+1]; tapdisk_logfile_name(newpath, sizeof(newpath), dir, ident, suffix); return __tapdisk_logfile_rename(log, newpath); } void tapdisk_logfile_close(td_logfile_t *log) { if (log->file) { fclose(log->file); log->file = NULL; } tapdisk_logfile_free_buffer(log); } int tapdisk_logfile_open(td_logfile_t *log, const char *dir, const char *ident, const char *ext, size_t bufsz) { int err; memset(log, 0, sizeof(log)); tapdisk_logfile_name(log->path, sizeof(log->path), dir, ident, ext); log->file = fopen(log->path, "w"); if (!log->file) { err = -errno; goto fail; } err = tapdisk_logfile_init_buffer(log, bufsz); if (err) goto fail; return 0; fail: tapdisk_logfile_unlink(log); tapdisk_logfile_close(log); return err; } int tapdisk_logfile_setvbuf(td_logfile_t *log, int mode) { int err = 0; if (log->file) { err = setvbuf(log->file, log->vbuf, mode, log->vbufsz); if (err) err = -errno; } return err; } ssize_t tapdisk_logfile_vprintf(td_logfile_t *log, const char *fmt, va_list ap) { char buf[1024]; size_t size, n; ssize_t len; struct timeval tv; if (!log->file) return -EBADF; gettimeofday(&tv, NULL); size = sizeof(buf); len = 0; len += tapdisk_syslog_strftime(buf, size, &tv); len += snprintf(buf + len, size - len, ": "); len += tapdisk_syslog_strftv(buf + len, size - len, &tv); len += snprintf(buf + len, size - len, " "); len += vsnprintf(buf + len, size - len, fmt, ap); if (buf[len-1] != '\n') len 
+= snprintf(buf + len, size - len, "\n"); n = fwrite(buf, len, 1, log->file); if (n != len) len = -ferror(log->file); return len; } ssize_t tapdisk_logfile_printf(td_logfile_t *log, const char *fmt, ...) { va_list ap; int rv; va_start(ap, fmt); rv = tapdisk_logfile_vprintf(log, fmt, ap); va_end(ap); return rv; } int tapdisk_logfile_flush(td_logfile_t *log) { int rv = EOF; if (log->file) rv = fflush(log->file); return rv; } blktap-2.0.90/drivers/profile.h0000644000000000000000000000654211664745551015071 0ustar rootroot/* Copyright (c) 2007, XenSource Inc. * All rights reserved. */ #ifndef __TAP_PROFILE_H__ #define __TAP_PROFILE_H__ #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #include #include #include #include #include #include #include #include #include //#define PROFILING //#define LOGGING #define TAPPROF_IN 1 #define TAPPROF_OUT 2 struct profile_times { char *fn_name; uint64_t in, out_sum, cnt; }; struct profile_info { FILE *log; int size; char *name; unsigned long long seq; struct profile_times *pt; }; #ifdef PROFILING static inline void tp_open(struct profile_info *prof, char *tap_name, char *log_name, int size) { memset(prof, 0, sizeof(struct profile_info)); #ifdef LOGGING prof->log = fopen(log_name, "w"); #endif prof->size = size; prof->name = strdup(tap_name); prof->pt = malloc(sizeof(struct profile_times) * prof->size); if (prof->pt) memset(prof->pt, 0, sizeof(struct profile_times) * prof->size); } static inline void tp_close(struct profile_info *prof) { int i; struct profile_times *pt; for (i = 0; i < prof->size; i++) { pt = &prof->pt[i]; if (pt->fn_name) { syslog(LOG_DEBUG, "%s: %s: cnt: %llu, avg time: %llu\n", prof->name, pt->fn_name, pt->cnt, ((pt->cnt) ? 
(pt->out_sum / pt->cnt) : 0)); free(pt->fn_name); } } #ifdef LOGGING if (prof->log) fclose(prof->log); #endif free(prof->name); if (prof->pt) free(prof->pt); } static inline u64 tp_get_id(struct profile_info *prof) { return prof->seq++; } static inline int tp_fn_id(struct profile_info *prof, const char *name) { int i; struct profile_times *pt; for (i = 0; i < prof->size; i++) { pt = &prof->pt[i]; if (!pt->fn_name) return i; if (!strcmp(pt->fn_name, name)) return i; } return prof->size - 1; } static inline void __tp_in(struct profile_info *prof, const char *func) { long long _time; int idx = tp_fn_id(prof, func); struct profile_times *pt = &prof->pt[idx]; if (!pt->fn_name) pt->fn_name = strdup(func); asm volatile(".byte 0x0f, 0x31" : "=A" (_time)); pt->in = _time; } #define tp_in(prof) __tp_in(prof, __func__) static inline void __tp_out(struct profile_info *prof, const char *func) { long long _time; int idx = tp_fn_id(prof, func); struct profile_times *pt = &prof->pt[idx]; if (!pt->fn_name || !pt->in) return; asm volatile(".byte 0x0f, 0x31" : "=A" (_time)); pt->cnt++; pt->out_sum += (_time - pt->in); pt->in = 0; } #define tp_out(prof) __tp_out(prof, __func__) static inline void __tp_log(struct profile_info *prof, u64 id, const char *func, int direction) { long long _time; asm volatile(".byte 0x0f, 0x31" : "=A" (_time)); if (direction == TAPPROF_IN) __tp_in(prof, func); else __tp_out(prof, func); #ifdef LOGGING if (prof->log) fprintf(prof->log, "%s: %s: %llu, %lld\n", func, ((direction == TAPPROF_IN) ? "in" : "out"), id, _time); #endif } #define tp_log(prof, id, direction) __tp_log(prof, id, __func__, direction) #else #define tp_open(prof, tname, lname, size) ((void)0) #define tp_close(prof) ((void)0) #define tp_in(prof) ((void)0) #define tp_out(prof) ((void)0) #define tp_log(prof, sec, direction) ((void)0) #endif #endif blktap-2.0.90/drivers/tapdisk-loglimit.c0000644000000000000000000000543311664745551016677 0ustar rootroot/* * Copyright (c) 2011, XenSource Inc. 
* All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Simple log rate limiting. Allow for bursts, then drop messages * until some interval expired. 
*/ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "tapdisk-loglimit.h" #include "compiler.h" #include "list.h" void tapdisk_loglimit_init(td_loglimit_t *rl, int burst, int interval) { rl->burst = burst; rl->interval = interval; rl->count = 0; rl->dropped = 0; gettimeofday(&rl->ts, NULL); } static void timeradd_ms(struct timeval *tv, long ms) { tv->tv_usec += ms * 1000; if (tv->tv_usec > 1000000) { tv->tv_sec += tv->tv_usec / 1000000; tv->tv_usec %= 1000000; } } static void tapdisk_loglimit_update(td_loglimit_t *rl, struct timeval *now) { struct timeval next = rl->ts; timeradd_ms(&next, rl->interval); if (timercmp(&next, now, <)) { rl->count = 0; rl->ts = *now; } } static void tapdisk_loglimit_update_now(td_loglimit_t *rl) { struct timeval now; gettimeofday(&now, NULL); tapdisk_loglimit_update(rl, &now); } int tapdisk_loglimit_pass(td_loglimit_t *rl) { if (!rl->interval) return 1; /* unlimited */ if (unlikely(rl->count >= rl->burst)) { tapdisk_loglimit_update_now(rl); if (rl->count >= rl->burst) { rl->dropped++; return 0; } } rl->count++; return 1; } blktap-2.0.90/drivers/tapdisk.h0000644000000000000000000001643511664745551015072 0ustar rootroot/* * Copyright (c) 2007, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Some notes on the tap_disk interface: * * tap_disk aims to provide a generic interface to easily implement new * types of image accessors. The structure-of-function-calls is similar * to disk interfaces used in qemu/denali/etc, with the significant * difference being the expectation of asynchronous rather than synchronous * I/O. The asynchronous interface is intended to allow lots of requests to * be pipelined through a disk, without the disk requiring any of its own * threads of control. As such, a batch of requests is delivered to the disk * using: * * td_queue_[read,write]() * * and passing in a completion callback, which the disk is responsible for * tracking. Disks should transform these requests as necessary and return * the resulting iocbs to tapdisk using td_prep_[read,write]() and * td_queue_tiocb(). * * NOTE: tapdisk uses the number of sectors submitted per request as a * ref count. Plugins must use the callback function to communicate the * completion -- or error -- of every sector submitted to them. 
*
 * td_get_parent_id returns:
 *     0 if parent id successfully retrieved
 *     TD_NO_PARENT if no parent exists
 *     -errno on error
 */

#ifndef _TAPDISK_H_
#define _TAPDISK_H_

/*
 * NOTE(review): the next two directives have no header name in this copy
 * of the file -- presumably lost when the archive was extracted; confirm
 * against the upstream source.
 */
#include
#include

#include "list.h"
#include "compiler.h"
#include "tapdisk-log.h"
#include "tapdisk-utils.h"
#include "tapdisk-stats.h"

#define MAX_SEGMENTS_PER_REQ             11
#define MAX_REQUESTS                     32U
#define SECTOR_SHIFT                     9
#define DEFAULT_SECTOR_SIZE              512

/* product of the two limits above: 32 * 11 = 352 */
#define TAPDISK_DATA_REQUESTS       (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ)

//#define BLK_NOT_ALLOCATED (-99)
/* positive, so it cannot collide with 0 (success) or -errno (failure) */
#define TD_NO_PARENT                     1

/* presumably a count of 512-byte sectors (1024000 * 512 = 500 MiB) -- TODO confirm */
#define MAX_RAMDISK_SIZE                 1024000 /*500MB disk limit*/

#define TD_OP_READ                       0
#define TD_OP_WRITE                      1

/* single-bit values, combinable in a td_flag_t */
#define TD_OPEN_QUIET                    0x00001
#define TD_OPEN_QUERY                    0x00002
#define TD_OPEN_RDONLY                   0x00004
#define TD_OPEN_STRICT                   0x00008
#define TD_OPEN_SHAREABLE                0x00010
#define TD_OPEN_ADD_CACHE                0x00020
#define TD_OPEN_VHD_INDEX                0x00040
#define TD_OPEN_LOG_DIRTY                0x00080
#define TD_OPEN_LOCAL_CACHE              0x00100
#define TD_OPEN_REUSE_PARENT             0x00200
#define TD_OPEN_SECONDARY                0x00400
#define TD_OPEN_STANDBY                  0x00800
#define TD_IGNORE_ENOSPC                 0x01000

/* note: these reuse the low bit values of the TD_OPEN_* set */
#define TD_CREATE_SPARSE                 0x00001
#define TD_CREATE_MULTITYPE              0x00002

/* set/clear modify @word in place; test yields the masked bits (non-zero if any set) */
#define td_flag_set(word, flag)          ((word) |= (flag))
#define td_flag_clear(word, flag)        ((word) &= ~(flag))
#define td_flag_test(word, flag)         ((word) & (flag))

typedef uint16_t                         td_uuid_t;
typedef uint32_t                         td_flag_t;
typedef uint64_t                         td_sector_t;
typedef struct td_disk_id                td_disk_id_t;
typedef struct td_disk_info              td_disk_info_t;
typedef struct td_request                td_request_t;
typedef struct td_driver_handle          td_driver_t;
typedef struct td_image_handle           td_image_t;
typedef struct td_sector_count           td_sector_count_t;
typedef struct td_vbd_request            td_vbd_request_t;
typedef struct td_vbd_handle             td_vbd_t;

/*
 * Prototype of the callback to activate as requests complete.
 * Note that td_callback_t receives the td_request_t by value.
 */
typedef void (*td_callback_t)(td_request_t, int);
typedef void (*td_vreq_callback_t)(td_vbd_request_t*, int, void*, int);

struct td_disk_id {
	char                        *name;
	int                          type;
	int                          flags;
};

struct td_disk_info {
	td_sector_t                  size;
	long                         sector_size;
	uint32_t                     info;
};

/* one data segment: a buffer and its length in sectors */
struct td_iovec {
	void                       *base;
	unsigned int                secs;
};

struct td_vbd_request {
	int                         op;
	td_sector_t                 sec;
	struct td_iovec            *iov;
	int                         iovcnt;

	td_vreq_callback_t          cb;
	void                       *token;
	const char                 *name;

	int                         error;
	int                         prev_error;

	int                         submitting;
	int                         secs_pending;
	int                         num_retries;
	struct timeval              ts;
	struct timeval              last_try;

	td_vbd_t                   *vbd;
	struct list_head            next;
	struct list_head           *list_head;
};

struct td_request {
	int                          op;
	void                        *buf;

	td_sector_t                  sec;
	int                          secs;

	td_image_t                  *image;

	td_callback_t                cb;
	void                        *cb_data;

	int                          sidx;
	td_vbd_request_t            *vreq;
};

/*
 * Structure describing the interface to a virtual disk implementation.
 * See note at the top of this file describing this interface.
 */
struct tap_disk {
	const char                  *disk_type;
	td_flag_t                    flags;
	int                          private_data_size;
	int (*td_open)               (td_driver_t *, const char *, td_flag_t);
	int (*td_close)              (td_driver_t *);
	int (*td_get_parent_id)      (td_driver_t *, td_disk_id_t *);
	int (*td_validate_parent)    (td_driver_t *, td_driver_t *, td_flag_t);
	void (*td_queue_read)        (td_driver_t *, td_request_t);
	void (*td_queue_write)       (td_driver_t *, td_request_t);
	void (*td_debug)             (td_driver_t *);
	void (*td_stats)             (td_driver_t *, td_stats_t *);
};

/* running totals of sectors read and written */
struct td_sector_count {
	td_sector_t rd;
	td_sector_t wr;
};

/* Add @v sectors to the write total if @write is non-zero, else to the read total. */
static inline void
td_sector_count_add(td_sector_count_t *s, td_sector_t v, int write)
{
	if (write)
		s->wr += v;
	else
		s->rd += v;
}

void td_panic(void);

#endif
blktap-2.0.90/drivers/tapdisk-stats.c0000644000000000000000000000730711664745551016217 0ustar  rootroot/*
 * Copyright (c) 2010, Citrix Systems, Inc.
 *
 * All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include "tapdisk.h" #include "tapdisk-stats.h" #define BUG_ON(_cond) if (_cond) { td_panic(); } static void __stats_vsprintf(td_stats_t *st, const char *fmt, va_list ap) { size_t size = st->buf + st->size - st->pos; st->pos += vsnprintf(st->pos, size, fmt, ap); } static void __printf(2, 3) __stats_sprintf(td_stats_t *st, const char *fmt, ...) 
{ va_list ap; va_start(ap, fmt); __stats_vsprintf(st, fmt, ap); va_end(ap); } static void __stats_enter(td_stats_t *st) { st->depth++; BUG_ON(st->depth > TD_STATS_MAX_DEPTH); st->n_elem[st->depth] = 0; } static void __stats_leave(td_stats_t *st) { st->depth--; } static void __stats_next(td_stats_t *st) { int n_elem; n_elem = st->n_elem[st->depth]; if (n_elem > 0) __stats_sprintf(st, ", "); st->n_elem[st->depth]++; } static void __tapdisk_stats_enter(td_stats_t *st, char t) { __stats_sprintf(st, "%c ", t); __stats_enter(st); } void tapdisk_stats_enter(td_stats_t *st, char t) { __stats_next(st); __tapdisk_stats_enter(st, t); } void tapdisk_stats_leave(td_stats_t *st, char t) { __stats_leave(st); __stats_sprintf(st, " %c", t); } static void tapdisk_stats_vval(td_stats_t *st, const char *conv, va_list ap) { char t = conv[0], fmt[32]; __stats_next(st); switch (t) { case 's': __stats_vsprintf(st, "\"%s\"", ap); break; default: sprintf(fmt, "%%%s", conv); __stats_vsprintf(st, fmt, ap); break; } } void tapdisk_stats_val(td_stats_t *st, const char *conv, ...) { va_list ap; va_start(ap, conv); tapdisk_stats_vval(st, conv, ap); va_end(ap); } void tapdisk_stats_field(td_stats_t *st, const char *key, const char *conv, ...) { va_list ap; int n_elem; char t; n_elem = st->n_elem[st->depth]++; if (n_elem > 0) __stats_sprintf(st, ", "); __stats_sprintf(st, "\"%s\": ", key); if (!conv) { __stats_sprintf(st, "null"); return; } t = conv[0]; switch (t) { case '[': case '{': __tapdisk_stats_enter(st, t); break; default: va_start(ap, conv); __stats_enter(st); tapdisk_stats_vval(st, conv, ap); __stats_leave(st); va_end(ap); } } blktap-2.0.90/drivers/tapdisk2.h0000644000000000000000000000020411664745551015137 0ustar rootroot#ifndef __TAPDISK2_H__ #define __TAPDISK2_H__ int tapdisk2_create_device(); int tapdisk2_attach_device(int, const char *); #endif blktap-2.0.90/drivers/tapdisk-queue.h0000644000000000000000000000512011664745551016201 0ustar rootroot/* Copyright (c) 2007, XenSource Inc. 
* All rights reserved.
 */

#ifndef TAPDISK_QUEUE_H
#define TAPDISK_QUEUE_H

/*
 * NOTE(review): header name missing in this copy of the file --
 * presumably lost in extraction; `struct iocb` below needs a
 * definition from somewhere, confirm against upstream.
 */
#include

#include "io-optimize.h"
#include "scheduler.h"

struct tiocb;
struct tfilter;

/* completion callback: receives the caller's arg, the tiocb and an error code */
typedef void (*td_queue_callback_t)(void *arg, struct tiocb *, int err);


/* one queued request: an iocb plus its completion callback; linked via next */
struct tiocb {
	td_queue_callback_t   cb;
	void                 *arg;

	struct iocb           iocb;
	struct tiocb         *next;
};

/* singly linked list of tiocbs with head and tail pointers */
struct tlist {
	struct tiocb         *head;
	struct tiocb         *tail;
};

struct tqueue {
	int                   size;

	const struct tio     *tio;
	void                 *tio_data;

	struct opioctx        opioctx;

	int                   queued;
	struct iocb         **iocbs;

	/* number of iocbs pending in the aio layer */
	int                   iocbs_pending;

	/* number of tiocbs pending in the queue -- 
	 * this is likely to be larger than iocbs_pending 
	 * due to request coalescing */
	int                   tiocbs_pending;

	/* iocbs may be deferred if the aio ring is full.
	 * tapdisk_queue_complete will ensure deferred
	 * iocbs are queued as slots become available. */
	struct tlist          deferred;
	int                   tiocbs_deferred;

	/* optional tapdisk filter */
	struct tfilter       *filter;

	uint64_t              deferrals;
};

/* I/O backend operations table; data_size is presumably the size of tqueue.tio_data -- TODO confirm */
struct tio {
	const char           *name;
	size_t                data_size;

	int  (*tio_setup)    (struct tqueue *queue, int qlen);
	void (*tio_destroy)  (struct tqueue *queue);
	int  (*tio_submit)   (struct tqueue *queue);
};

/* backend selectors; presumably the values accepted as `drv` by tapdisk_init_queue -- TODO confirm */
enum {
	TIO_DRV_LIO     = 1,
	TIO_DRV_RWIO    = 2,
};

/*
 * Interface for request producer (i.e., tapdisk)
 * NB: the following functions may cause additional tiocbs to be queued:
 *       - tapdisk_submit_tiocbs
 *       - tapdisk_cancel_tiocbs
 *       - tapdisk_complete_tiocbs
 * The *_all_tiocbs variants will handle the first two cases;
 * be sure to call submit after calling complete in the third case.
 */

/* full means: pending tiocbs plus queued ones have reached the queue size */
#define tapdisk_queue_count(q) ((q)->queued)
#define tapdisk_queue_empty(q) ((q)->queued == 0)
#define tapdisk_queue_full(q)  \
	(((q)->tiocbs_pending + (q)->queued) >= (q)->size)
int tapdisk_init_queue(struct tqueue *, int size, int drv, struct tfilter *);
void tapdisk_free_queue(struct tqueue *);
void tapdisk_debug_queue(struct tqueue *);
void tapdisk_queue_tiocb(struct tqueue *, struct tiocb *);
int tapdisk_submit_tiocbs(struct tqueue *);
int tapdisk_submit_all_tiocbs(struct tqueue *);
int tapdisk_cancel_tiocbs(struct tqueue *);
int tapdisk_cancel_all_tiocbs(struct tqueue *);
void tapdisk_prep_tiocb(struct tiocb *, int, int, char *, size_t,
			long long, td_queue_callback_t, void *);

#endif
blktap-2.0.90/drivers/tapdisk2.c0000644000000000000000000000647111664745551015146 0ustar  rootroot /*
 * Copyright (c) 2008, XenSource Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of XenSource Inc. nor the names of its contributors
 *       may be used to endorse or promote products derived from this software
 *       without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include "tapdisk.h" #include "tapdisk-utils.h" #include "tapdisk-server.h" #include "tapdisk-control.h" static void usage(const char *app, int err) { fprintf(stderr, "usage: %s <-u uuid> <-c control socket>\n", app); exit(err); } static FILE * fdup(FILE *stream, const char *mode) { int fd, err; FILE *f; fd = dup(STDOUT_FILENO); if (fd < 0) goto fail; f = fdopen(fd, mode); if (!f) goto fail; return f; fail: err = -errno; if (fd >= 0) close(fd); errno = -err; return NULL; } int main(int argc, char *argv[]) { char *control; int c, err, nodaemon; FILE *out; control = NULL; nodaemon = 0; while ((c = getopt(argc, argv, "Dh")) != -1) { switch (c) { case 'D': nodaemon = 1; break; case 'h': usage(argv[0], 0); break; default: usage(argv[0], EINVAL); } } if (optind != argc) usage(argv[0], EINVAL); err = tapdisk_server_init(); if (err) { DPRINTF("failed to initialize server: %d\n", err); goto out; } out = fdup(stdout, "w"); if (!out) { err = -errno; DPRINTF("failed to dup stdout: %d\n", err); goto out; } if (!nodaemon) { err = daemon(0, 0); if (err) { DPRINTF("failed to daemonize: %d\n", errno); goto out; } } tapdisk_start_logging("tapdisk", NULL); err = tapdisk_control_open(&control); if (err) { DPRINTF("failed to open control socket: %d\n", err); goto out; } err = tapdisk_server_complete(); if (err) { DPRINTF("failed to complete server: %d\n", err); goto out; } 
fprintf(out, "%s\n", control); fclose(out); err = tapdisk_server_run(); out: tapdisk_control_close(); tapdisk_stop_logging(); return -err; } blktap-2.0.90/drivers/block-log.c0000644000000000000000000004034311664745551015272 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ /* Driver to sit on top of another disk and log writes, in order * to synchronize two distinct disks * * On receipt of a control request it can export a list of dirty * sectors in the following format: * struct writerange { * u64 sector; * u32 count; * } * terminated by { 0, 0 } */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include #include "log.h" #include "tapdisk.h" #include "tapdisk-server.h" #include "tapdisk-driver.h" #include "tapdisk-interface.h" #define MAX_CONNECTIONS 1 typedef struct poll_fd { int fd; event_id_t id; } poll_fd_t; struct tdlog_state { uint64_t size; void* writelog; char* ctlpath; poll_fd_t ctl; int connected; poll_fd_t connections[MAX_CONNECTIONS]; char* shmpath; void* shm; log_sring_t* sring; log_back_ring_t bring; }; #define BDPRINTF(_f, _a...) syslog (LOG_DEBUG, "log: " _f "\n", ## _a) #define BWPRINTF(_f, _a...) syslog (LOG_WARNING, "log: " _f "\n", ## _a) static void ctl_accept(event_id_t, char, void *); static void ctl_request(event_id_t, char, void *); /* -- write log -- */ /* large flat bitmaps don't scale particularly well either in size or scan * time, but they'll do for now */ #define BITS_PER_LONG (sizeof(unsigned long) * 8) #define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG) #define BITMAP_ENTRY(_nr, _bmap) ((unsigned long*)(_bmap))[(_nr)/BITS_PER_LONG] #define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG) static inline int test_bit(int nr, void* bmap) { return (BITMAP_ENTRY(nr, bmap) >> BITMAP_SHIFT(nr)) & 1; } static inline void clear_bit(int nr, void* bmap) { BITMAP_ENTRY(nr, bmap) &= ~(1UL << BITMAP_SHIFT(nr)); } static inline void set_bit(int nr, void* bmap) { BITMAP_ENTRY(nr, bmap) |= (1UL << BITMAP_SHIFT(nr)); } static inline int bitmap_size(uint64_t sz) { return sz >> 3; } static int writelog_create(struct tdlog_state *s) { uint64_t bmsize; bmsize = bitmap_size(s->size); BDPRINTF("allocating %"PRIu64" bytes for dirty bitmap", bmsize); 
if (!(s->writelog = calloc(bmsize, 1))) { BWPRINTF("could not allocate dirty bitmap of size %"PRIu64, bmsize); return -1; } return 0; } static int writelog_free(struct tdlog_state *s) { if (s->writelog) free(s->writelog); return 0; } static int writelog_set(struct tdlog_state* s, uint64_t sector, int count) { int i; for (i = 0; i < count; i++) set_bit(sector + i, s->writelog); return 0; } /* if end is 0, clear to end of disk */ int writelog_clear(struct tdlog_state* s, uint64_t start, uint64_t end) { if (!end) end = s->size; /* clear to word boundaries */ while (BITMAP_SHIFT(start)) clear_bit(start++, s->writelog); while (BITMAP_SHIFT(end)) clear_bit(end--, s->writelog); memset(s->writelog + start / BITS_PER_LONG, 0, (end - start) >> 3); return 0; } /* returns last block exported (may not be end of disk if shm region * overflows) */ static uint64_t writelog_export(struct tdlog_state* s) { struct disk_range* range = s->shm; uint64_t i = 0; BDPRINTF("sector count: %"PRIu64, s->size); for (i = 0; i < s->size; i++) { if (test_bit(i, s->writelog)) { /* range start */ range->sector = i; range->count = 1; /* find end */ for (i++; i < s->size && test_bit(i, s->writelog); i++) range->count++; BDPRINTF("export: dirty extent %"PRIu64":%u", range->sector, range->count); range++; /* out of space in shared memory region */ if ((void*)range >= bmend(s->shm)) { BDPRINTF("out of space in shm region at sector %"PRIu64, i); return i; } /* undo forloop increment */ i--; } } /* NULL-terminate range list */ range->sector = 0; range->count = 0; return i; } /* -- communication channel -- */ /* remove FS special characters in up to len bytes of path */ static inline void path_escape(char* path, size_t len) { int i; for (i = 0; i < len && path[i]; i++) if (strchr(":/", path[i])) path[i] = '_'; } static char* ctl_makepath(const char* name, const char* ext) { char* res; char *file; file = strrchr(name, '/'); if (!file) { BWPRINTF("invalid name %s\n", name); return NULL; } if (asprintf(&res, 
BLKTAP_CTRL_DIR "/log_%s.%s", file, ext) < 0) { BWPRINTF("could not allocate path"); return NULL; } path_escape(res + strlen(BLKTAP_CTRL_DIR) + 5, strlen(file)); return res; } static int shmem_open(struct tdlog_state* s, const char* name) { int i, l, fd; /* device name -> path */ if (asprintf(&s->shmpath, "/log_%s.wlog", name) < 0) { BWPRINTF("could not allocate shm path"); return -1; } path_escape(s->shmpath + 5, strlen(name)); if ((fd = shm_open(s->shmpath, O_CREAT|O_RDWR, 0750)) < 0) { BWPRINTF("could not open shared memory file %s: %s", s->shmpath, strerror(errno)); goto err; } if (ftruncate(fd, SHMSIZE) < 0) { BWPRINTF("error truncating shmem to size %u", SHMSIZE); close(fd); goto err; } s->shm = mmap(NULL, SHMSIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); close(fd); if (s->shm == MAP_FAILED) { BWPRINTF("could not mmap write log shm: %s", strerror(errno)); goto err; } return 0; err: s->shm = NULL; free(s->shmpath); s->shmpath = NULL; return -1; } static int shmem_close(struct tdlog_state* s) { if (s->shm) { munmap(s->shm, SHMSIZE); s->shm = NULL; } if (s->shmpath) { shm_unlink(s->shmpath); s->shmpath = NULL; } return 0; } /* control socket */ static int ctl_open(struct tdlog_state* s, const char* name) { struct sockaddr_un saddr; if (!(s->ctlpath = ctl_makepath(name, "ctl"))) return -1; if ((s->ctl.fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { BWPRINTF("error opening control socket: %s", strerror(errno)); goto err; } memset(&saddr, 0, sizeof(saddr)); saddr.sun_family = AF_UNIX; memcpy(saddr.sun_path, s->ctlpath, strlen(s->ctlpath)); if (unlink(s->ctlpath) && errno != ENOENT) { BWPRINTF("error unlinking old socket path %s: %s", s->ctlpath, strerror(errno)); goto err_sock; } if (bind(s->ctl.fd, &saddr, sizeof(saddr)) < 0) { BWPRINTF("error binding control socket to %s: %s", s->ctlpath, strerror(errno)); goto err_sock; } if (listen(s->ctl.fd, 1) < 0) { BWPRINTF("error listening on control socket: %s", strerror(errno)); goto err_sock; } s->ctl.id = 
tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, s->ctl.fd, 0, ctl_accept, s); if (s->ctl.id < 0) { BWPRINTF("error register event handler: %s", strerror(s->ctl.id)); goto err_sock; } return 0; err_sock: close(s->ctl.fd); s->ctl.fd = -1; err: free(s->ctlpath); s->ctlpath = NULL; return -1; } static int ctl_close(struct tdlog_state* s) { while (s->connected) { tapdisk_server_unregister_event(s->connections[s->connected].id); close(s->connections[s->connected].fd); s->connections[s->connected].fd = -1; s->connections[s->connected].id = 0; s->connected--; } if (s->ctl.fd >= 0) { tapdisk_server_unregister_event(s->ctl.id); close(s->ctl.fd); s->ctl.fd = -1; s->ctl.id = 0; } if (s->ctlpath) { unlink(s->ctlpath); free(s->ctlpath); s->ctlpath = NULL; } /* XXX this must be fixed once requests are actually in flight */ /* could just drain the existing ring here first */ if (s->sring) { SHARED_RING_INIT(s->sring); BACK_RING_INIT(&s->bring, s->sring, SRINGSIZE); } return 0; } /* walk list of open sockets, close matching fd */ static int ctl_close_sock(struct tdlog_state* s, int fd) { int i; for (i = 0; i <= s->connected; i++) { if (s->connections[i].fd == fd) { tapdisk_server_unregister_event(s->connections[i].id); close(s->connections[i].fd); s->connections[i].fd = -1; s->connections[i].id = 0; s->connected--; return 0; } } BWPRINTF("requested to close unknown socket %d", fd); return -1; } static void ctl_accept(event_id_t id, char mode, void *private) { struct tdlog_state* s = (struct tdlog_state *)private; int fd; event_id_t cid; if ((fd = accept(s->ctl.fd, NULL, NULL)) < 0) { BWPRINTF("error accepting control connection: %s", strerror(errno)); return; } if (s->connected) { BWPRINTF("control session in progress, closing new connection"); close(fd); return; } cid = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, fd, 0, ctl_request, s); if (cid < 0) { BWPRINTF("error registering connection event handler: %s", strerror(cid)); close(fd); return; } 
s->connections[s->connected].fd = fd; s->connections[s->connected].id = cid; s->connected++; } /* response format: 4 bytes shmsize, 0-terminated path */ static int ctl_get_shmpath(struct tdlog_state* s, int fd) { char msg[CTLRSPLEN_SHMP + 1]; uint32_t sz; int rc; BDPRINTF("ctl: sending shared memory parameters (size: %u, path: %s)", SHMSIZE, s->shmpath); /* TMP: sanity-check shm */ sz = 0xdeadbeef; memcpy(s->shm, &sz, sizeof(sz)); sz = SHMSIZE; memcpy(msg, &sz, sizeof(sz)); snprintf(msg + sizeof(sz), sizeof(msg) - sizeof(sz), "%s", s->shmpath); if ((rc = write(fd, msg, CTLRSPLEN_SHMP)) < 0) { BWPRINTF("error writing shmpath: %s", strerror(errno)); return -1; } return 0; } static int ctl_peek_writes(struct tdlog_state* s, int fd) { int rc; BDPRINTF("ctl: peeking bitmap"); writelog_export(s); if ((rc = write(fd, "done", CTLRSPLEN_PEEK)) < 0) { BWPRINTF("error writing peek ack: %s", strerror(errno)); return -1; } return 0; } static int ctl_clear_writes(struct tdlog_state* s, int fd) { int rc; BDPRINTF("ctl: clearing bitmap"); writelog_clear(s, 0, 0); if ((rc = write(fd, "done", CTLRSPLEN_CLEAR)) < 0) { BWPRINTF("error writing clear ack: %s", strerror(errno)); return -1; } return 0; } /* get dirty bitmap and clear it atomically */ static int ctl_get_writes(struct tdlog_state* s, int fd) { int rc; BDPRINTF("ctl: getting bitmap"); writelog_export(s); writelog_clear(s, 0, 0); if ((rc = write(fd, "done", CTLRSPLEN_GET)) < 0) { BWPRINTF("error writing get ack: %s", strerror(errno)); return -1; } return 0; } /* get requests from ring */ static int ctl_kick(struct tdlog_state* s, int fd) { RING_IDX reqstart, reqend; log_request_t req; /* XXX testing */ RING_IDX rspstart, rspend; log_response_t rsp; struct log_ctlmsg msg; int rc; reqstart = s->bring.req_cons; reqend = s->sring->req_prod; BDPRINTF("ctl: ring kicked (start = %u, end = %u)", reqstart, reqend); while (reqstart != reqend) { /* XXX actually submit these! 
*/ memcpy(&req, RING_GET_REQUEST(&s->bring, reqstart), sizeof(req)); BDPRINTF("ctl: read request %"PRIu64":%u", req.sector, req.count); s->bring.req_cons = ++reqstart; rsp.sector = req.sector; rsp.count = req.count; memcpy(RING_GET_RESPONSE(&s->bring, s->bring.rsp_prod_pvt), &rsp, sizeof(rsp)); s->bring.rsp_prod_pvt++; } RING_PUSH_RESPONSES(&s->bring); memset(&msg, 0, sizeof(msg)); memcpy(msg.msg, LOGCMD_KICK, 4); if ((rc = write(fd, &msg, sizeof(msg))) < 0) { BWPRINTF("error sending notify: %s", strerror(errno)); return -1; } else if (rc < sizeof(msg)) { BWPRINTF("short notify write (%d/%zd)", rc, sizeof(msg)); return -1; } return 0; } static int ctl_do_request(struct tdlog_state* s, int fd, struct log_ctlmsg* msg) { if (!strncmp(msg->msg, LOGCMD_SHMP, 4)) { return ctl_get_shmpath(s, fd); } else if (!strncmp(msg->msg, LOGCMD_PEEK, 4)) { return ctl_peek_writes(s, fd); } else if (!strncmp(msg->msg, LOGCMD_CLEAR, 4)) { return ctl_clear_writes(s, fd); } else if (!strncmp(msg->msg, LOGCMD_GET, 4)) { return ctl_get_writes(s, fd); } else if (!strncmp(msg->msg, LOGCMD_KICK, 4)) { return ctl_kick(s, fd); } BWPRINTF("unknown control request %.4s", msg->msg); return -1; } static inline int ctl_find_connection(struct tdlog_state *s, event_id_t id) { int i; for (i = 0; i < s->connected; i++) if (s->connections[i].id == id) return s->connections[i].fd; BWPRINTF("unrecognized event callback id %d", id); return -1; } static void ctl_request(event_id_t id, char mode, void *private) { struct tdlog_state* s = (struct tdlog_state*)private; struct log_ctlmsg msg; int rc, i, fd = -1; fd = ctl_find_connection(s, id); if (fd == -1) return; if ((rc = read(fd, &msg, sizeof(msg))) < 0) { BWPRINTF("error reading from ctl socket %d, closing: %s", fd, strerror(errno)); ctl_close_sock(s, fd); return; } else if (rc == 0) { BDPRINTF("ctl_request: EOF, closing socket"); ctl_close_sock(s, fd); return; } else if (rc < sizeof(msg)) { BWPRINTF("short request received (%d/%zd bytes), ignoring", rc, 
sizeof(msg)); return; } ctl_do_request(s, fd, &msg); } /* -- interface -- */ static int tdlog_close(td_driver_t*); static int tdlog_open(td_driver_t* driver, const char* name, td_flag_t flags) { struct tdlog_state* s = (struct tdlog_state*)driver->data; int rc; memset(s, 0, sizeof(*s)); s->size = driver->info.size; if ((rc = writelog_create(s))) { tdlog_close(driver); return rc; } if ((rc = shmem_open(s, name))) { tdlog_close(driver); return rc; } if ((rc = ctl_open(s, name))) { tdlog_close(driver); return rc; } s->sring = (log_sring_t*)sringstart(s->shm); SHARED_RING_INIT(s->sring); BACK_RING_INIT(&s->bring, s->sring, SRINGSIZE); BDPRINTF("opened ctl socket"); return 0; } static int tdlog_close(td_driver_t* driver) { struct tdlog_state* s = (struct tdlog_state*)driver->data; ctl_close(s); shmem_close(s); writelog_free(s); return 0; } static void tdlog_queue_read(td_driver_t* driver, td_request_t treq) { td_forward_request(treq); } static void tdlog_queue_write(td_driver_t* driver, td_request_t treq) { struct tdlog_state* s = (struct tdlog_state*)driver->data; int rc; writelog_set(s, treq.sec, treq.secs); td_forward_request(treq); } static int tdlog_get_parent_id(td_driver_t* driver, td_disk_id_t* id) { return -EINVAL; } static int tdlog_validate_parent(td_driver_t *driver, td_driver_t *parent, td_flag_t flags) { return 0; } struct tap_disk tapdisk_log = { .disk_type = "tapdisk_log", .private_data_size = sizeof(struct tdlog_state), .flags = 0, .td_open = tdlog_open, .td_close = tdlog_close, .td_queue_read = tdlog_queue_read, .td_queue_write = tdlog_queue_write, .td_get_parent_id = tdlog_get_parent_id, .td_validate_parent = tdlog_validate_parent, }; blktap-2.0.90/drivers/scheduler.h0000644000000000000000000000533611664745551015407 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#ifndef _SCHEDULER_H_
#define _SCHEDULER_H_

/*
 * NOTE(review): header name missing in this copy of the file --
 * presumably lost in extraction; `fd_set` below needs a definition,
 * confirm against upstream.
 */
#include

#include "list.h"

/* event mode bits; each has its own bit so they can be combined in `mode` */
#define SCHEDULER_POLL_READ_FD       0x1
#define SCHEDULER_POLL_WRITE_FD      0x2
#define SCHEDULER_POLL_EXCEPT_FD     0x4
#define SCHEDULER_POLL_TIMEOUT       0x8

/* handle returned by scheduler_register_event() */
typedef int                          event_id_t;
/* event callback: receives the event id, a mode value and the registered private pointer */
typedef void (*event_cb_t)          (event_id_t id, char mode, void *private);

typedef struct scheduler {
	fd_set                       read_fds;
	fd_set                       write_fds;
	fd_set                       except_fds;

	struct list_head             events;

	int                          uuid;
	int                          max_fd;
	int                          timeout;
	int                          max_timeout;
	int                          depth;
} scheduler_t;

void scheduler_initialize(scheduler_t *);
/* NOTE(review): callers in this tree treat a negative return as failure */
event_id_t scheduler_register_event(scheduler_t *, char mode,
				    int fd, int timeout,
				    event_cb_t cb, void *private);
void scheduler_unregister_event(scheduler_t *, event_id_t);
void scheduler_mask_event(scheduler_t *, event_id_t, int masked);
void scheduler_set_max_timeout(scheduler_t *, int);
int scheduler_wait_for_events(scheduler_t *);

#endif
blktap-2.0.90/drivers/lock.h0000644000000000000000000000441411664745551014355 0ustar  rootroot/*
 * Copyright (c) 2007, XenSource Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of XenSource Inc. nor the names of its contributors
 *       may be used to endorse or promote products derived from this software
 *       without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #define DEFAULT_LEASE_TIME_SECS 30 int lock(char *fn_to_lock, char *uuid, int force, int readonly, int *lease_time, int *retstat); int unlock(char *fn_to_unlock, char *uuid, int readonly, int *retstat); int lock_delta(char *fn_to_check, int *cur_lease_time, int *max_lease_time); typedef enum { LOCK_OK = 0, LOCK_EBADPARM = -1, LOCK_ENOMEM = -2, LOCK_ESTAT = -3, LOCK_EHELD_WR = -4, LOCK_EHELD_RD = -5, LOCK_EOPEN = -6, LOCK_EXLOCK_OPEN = -7, LOCK_EXLOCK_WRITE= -8, LOCK_EINODE = -9, LOCK_EUPDATE = -10, LOCK_EREAD = -11, LOCK_EREMOVE = -12, LOCK_ENOLOCK = -13, LOCK_EUSAGE = -14, } lock_error; blktap-2.0.90/drivers/tapdisk-control.c0000644000000000000000000006320111664745551016534 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include "list.h" #include "tapdisk.h" #include "tapdisk-vbd.h" #include "tapdisk-blktap.h" #include "tapdisk-utils.h" #include "tapdisk-server.h" #include "tapdisk-message.h" #include "tapdisk-disktype.h" #include "tapdisk-stats.h" #include "tapdisk-control.h" #define TD_CTL_MAX_CONNECTIONS 10 #define TD_CTL_SOCK_BACKLOG 32 #define TD_CTL_RECV_TIMEOUT 10 #define TD_CTL_SEND_TIMEOUT 10 #define TD_CTL_SEND_BUFSZ ((size_t)4096) #define DBG(_f, _a...) tlog_syslog(LOG_DEBUG, _f, ##_a) #define ERR(err, _f, _a...) 
tlog_error(err, _f, ##_a) #define ASSERT(_p) \ if (!(_p)) { \ EPRINTF("%s:%d: FAILED ASSERTION: '%s'\n", \ __FILE__, __LINE__, #_p); \ td_panic(); \ } #define WARN_ON(_p) \ if (_p) { \ EPRINTF("%s:%d: WARNING: '%s'\n", \ __FILE__, __LINE__, #_p); \ } struct tapdisk_ctl_conn { int fd; struct { void *buf; size_t bufsz; int event_id; int done; void *prod; void *cons; } out; struct { int event_id; int busy; } in; struct tapdisk_control_info *info; }; #define TAPDISK_MSG_REENTER (1<<0) /* non-blocking, idempotent */ #define TAPDISK_MSG_VERBOSE (1<<1) /* tell syslog about it */ struct tapdisk_control_info { void (*handler)(struct tapdisk_ctl_conn *, tapdisk_message_t *); int flags; }; struct tapdisk_control { char *path; int uuid; int socket; int event_id; int busy; int n_conn; struct tapdisk_ctl_conn __conn[TD_CTL_MAX_CONNECTIONS]; struct tapdisk_ctl_conn *conn[TD_CTL_MAX_CONNECTIONS]; }; static struct tapdisk_control td_control; static inline size_t page_align(size_t size) { size_t page_size = sysconf(_SC_PAGE_SIZE); return (size + page_size - 1) & ~(page_size - 1); } static void tapdisk_ctl_conn_uninit(struct tapdisk_ctl_conn *conn) { if (conn->out.buf) { munmap(conn->out.buf, conn->out.bufsz); conn->out.buf = NULL; } } static int tapdisk_ctl_conn_init(struct tapdisk_ctl_conn *conn, size_t bufsz) { int prot, flags, err; memset(conn, 0, sizeof(*conn)); conn->out.event_id = -1; conn->in.event_id = -1; prot = PROT_READ|PROT_WRITE; flags = MAP_ANONYMOUS|MAP_PRIVATE; conn->out.buf = mmap(NULL, bufsz, prot, flags, -1, 0); if (conn->out.buf == MAP_FAILED) { conn->out.buf = NULL; err = -ENOMEM; goto fail; } conn->out.bufsz = page_align(bufsz); return 0; fail: tapdisk_ctl_conn_uninit(conn); return err; } static int tapdisk_ctl_conn_connected(struct tapdisk_ctl_conn *conn) { return conn->fd >= 1; } static void tapdisk_ctl_conn_free(struct tapdisk_ctl_conn *conn) { struct tapdisk_ctl_conn *prev, *next; int i; i = --td_control.n_conn; /* NB. 
bubble the freed connection off the active list. */ prev = conn; do { ASSERT(i >= 0); next = td_control.conn[i]; td_control.conn[i] = prev; prev = next; i--; } while (next != conn); } static void tapdisk_ctl_conn_close(struct tapdisk_ctl_conn *conn) { if (conn->out.event_id >= 0) { tapdisk_server_unregister_event(conn->out.event_id); conn->out.event_id = -1; } if (conn->fd >= 0) { close(conn->fd); conn->fd = -1; tapdisk_ctl_conn_free(conn); tapdisk_server_mask_event(td_control.event_id, 0); } } static void tapdisk_ctl_conn_mask_out(struct tapdisk_ctl_conn *conn) { tapdisk_server_mask_event(conn->out.event_id, 1); } static void tapdisk_ctl_conn_unmask_out(struct tapdisk_ctl_conn *conn) { tapdisk_server_mask_event(conn->out.event_id, 0); } static ssize_t tapdisk_ctl_conn_send_buf(struct tapdisk_ctl_conn *conn) { ssize_t size; size = conn->out.prod - conn->out.cons; if (!size) return 0; size = send(conn->fd, conn->out.cons, size, MSG_DONTWAIT); if (size < 0) return -errno; conn->out.cons += size; return size; } static void tapdisk_ctl_conn_send_event(event_id_t id, char mode, void *private) { struct tapdisk_ctl_conn *conn = private; ssize_t rv; do { rv = tapdisk_ctl_conn_send_buf(conn); } while (rv > 0); if (rv == -EAGAIN) return; if (rv < 0) ERR(rv, "failure sending message at offset %td/%td\n", conn->out.cons - conn->out.buf, conn->out.prod - conn->out.buf); if (rv || conn->out.done || mode & SCHEDULER_POLL_TIMEOUT) tapdisk_ctl_conn_close(conn); else tapdisk_ctl_conn_mask_out(conn); } /* * NB. the control interface is still not properly integrated into the * server, therefore neither the scheduler. After the last close, the * server will exit but we still have a pending close response in the * output buffer. 
*/ static void tapdisk_ctl_conn_drain(struct tapdisk_ctl_conn *conn) { struct timeval tv = { .tv_sec = TD_CTL_SEND_TIMEOUT, .tv_usec = 0 }; fd_set wfds; int n, mode; ASSERT(conn->out.done); ASSERT(conn->fd >= 0); while (tapdisk_ctl_conn_connected(conn)) { FD_ZERO(&wfds); FD_SET(conn->fd, &wfds); n = select(conn->fd + 1, NULL, &wfds, NULL, &tv); if (n < 0) break; if (n) mode = SCHEDULER_POLL_WRITE_FD; else mode = SCHEDULER_POLL_TIMEOUT; tapdisk_ctl_conn_send_event(conn->out.event_id, mode, conn); } } struct tapdisk_ctl_conn * tapdisk_ctl_conn_open(int fd) { struct tapdisk_ctl_conn *conn; if (td_control.n_conn >= TD_CTL_MAX_CONNECTIONS) return NULL; conn = td_control.conn[td_control.n_conn++]; conn->out.event_id = tapdisk_server_register_event(SCHEDULER_POLL_WRITE_FD, fd, TD_CTL_SEND_TIMEOUT, tapdisk_ctl_conn_send_event, conn); if (conn->out.event_id < 0) return NULL; conn->fd = fd; conn->out.prod = conn->out.buf; conn->out.cons = conn->out.buf; tapdisk_ctl_conn_mask_out(conn); if (td_control.n_conn >= TD_CTL_MAX_CONNECTIONS) tapdisk_server_mask_event(td_control.event_id, 1); return conn; } static size_t tapdisk_ctl_conn_write(struct tapdisk_ctl_conn *conn, void *buf, size_t size) { size_t rest; rest = conn->out.buf + conn->out.bufsz - conn->out.prod; if (rest < size) size = rest; if (!size) return 0; memcpy(conn->out.prod, buf, size); conn->out.prod += size; tapdisk_ctl_conn_unmask_out(conn); return size; } static void tapdisk_ctl_conn_release(struct tapdisk_ctl_conn *conn) { conn->out.done = 1; if (conn->out.prod == conn->out.cons) tapdisk_ctl_conn_close(conn); } static void tapdisk_control_initialize(void) { struct tapdisk_ctl_conn *conn; int i; td_control.socket = -1; td_control.event_id = -1; signal(SIGPIPE, SIG_IGN); for (i = 0; i < TD_CTL_MAX_CONNECTIONS; i++) { conn = &td_control.__conn[i]; tapdisk_ctl_conn_init(conn, TD_CTL_SEND_BUFSZ); td_control.conn[i] = conn; } td_control.n_conn = 0; DPRINTF("tapdisk-control: init, %d x %zuk buffers\n", 
TD_CTL_MAX_CONNECTIONS, TD_CTL_SEND_BUFSZ >> 10); } void tapdisk_control_close(void) { struct tapdisk_ctl_conn *conn; int i; DPRINTF("tapdisk-control: draining %d connections\n", td_control.n_conn); while (td_control.n_conn) { conn = td_control.conn[td_control.n_conn-1]; tapdisk_ctl_conn_drain(conn); } for (i = 0; i < TD_CTL_MAX_CONNECTIONS; i++) { conn = &td_control.__conn[i]; tapdisk_ctl_conn_uninit(conn); } DPRINTF("tapdisk-control: done\n"); if (td_control.path) { unlink(td_control.path); free(td_control.path); td_control.path = NULL; } if (td_control.socket != -1) { close(td_control.socket); td_control.socket = -1; } } static void tapdisk_control_release_connection(struct tapdisk_ctl_conn *conn) { if (conn->in.event_id) { tapdisk_server_unregister_event(conn->in.event_id); conn->in.event_id = -1; } tapdisk_ctl_conn_release(conn); } static void tapdisk_control_close_connection(struct tapdisk_ctl_conn *conn) { tapdisk_control_release_connection(conn); if (tapdisk_ctl_conn_connected(conn)) /* NB. best effort for write/close sequences. 
*/ tapdisk_ctl_conn_send_buf(conn); tapdisk_ctl_conn_close(conn); } static int tapdisk_control_read_message(int fd, tapdisk_message_t *message, int timeout) { const int len = sizeof(tapdisk_message_t); fd_set readfds; int ret, offset, err = 0; struct timeval tv, *t; t = NULL; offset = 0; if (timeout) { tv.tv_sec = timeout; tv.tv_usec = 0; t = &tv; } memset(message, 0, sizeof(tapdisk_message_t)); while (offset < len) { FD_ZERO(&readfds); FD_SET(fd, &readfds); ret = select(fd + 1, &readfds, NULL, NULL, t); if (ret == -1) break; else if (FD_ISSET(fd, &readfds)) { ret = read(fd, message + offset, len - offset); if (ret <= 0) break; offset += ret; } else break; } if (ret < 0) err = -errno; else if (offset != len) err = -EIO; if (err) ERR(err, "failure reading message at offset %d/%d\n", offset, len); return err; } static void tapdisk_control_write_message(struct tapdisk_ctl_conn *conn, tapdisk_message_t *message) { size_t size = sizeof(*message), count; if (conn->info->flags & TAPDISK_MSG_VERBOSE) DBG("sending '%s' message (uuid = %u)\n", tapdisk_message_name(message->type), message->cookie); count = tapdisk_ctl_conn_write(conn, message, size); WARN_ON(count != size); } static int tapdisk_control_validate_request(tapdisk_message_t *request) { if (strnlen(request->u.params.path, TAPDISK_MESSAGE_MAX_PATH_LENGTH) >= TAPDISK_MESSAGE_MAX_PATH_LENGTH) return EINVAL; return 0; } #if 0 static void tapdisk_control_list_minors(struct tapdisk_ctl_conn *conn, tapdisk_message_t *request) { int i; td_vbd_t *vbd; struct list_head *head; tapdisk_message_t response; i = 0; memset(&response, 0, sizeof(response)); response.type = TAPDISK_MESSAGE_LIST_MINORS_RSP; response.cookie = request->cookie; head = tapdisk_server_get_all_vbds(); list_for_each_entry(vbd, head, next) { td_blktap_t *tap = vbd->tap; if (!tap) continue; response.u.minors.list[i++] = tap->minor; if (i >= TAPDISK_MESSAGE_MAX_MINORS) { response.type = TAPDISK_MESSAGE_ERROR; response.u.response.error = ERANGE; break; } } 
response.u.minors.count = i; tapdisk_ctl_conn_write(conn, &response, 2); } #endif static void tapdisk_control_list(struct tapdisk_ctl_conn *conn, tapdisk_message_t *request) { td_vbd_t *vbd; struct list_head *head; tapdisk_message_t response; int count; memset(&response, 0, sizeof(response)); response.type = TAPDISK_MESSAGE_LIST_RSP; response.cookie = request->cookie; head = tapdisk_server_get_all_vbds(); count = 0; list_for_each_entry(vbd, head, next) count++; list_for_each_entry(vbd, head, next) { response.u.list.count = count--; response.u.list.minor = vbd->tap ? vbd->tap->minor : -1; response.u.list.state = vbd->state; response.u.list.path[0] = 0; if (vbd->name) strncpy(response.u.list.path, vbd->name, sizeof(response.u.list.path)); tapdisk_control_write_message(conn, &response); } response.u.list.count = count; response.u.list.minor = -1; response.u.list.path[0] = 0; tapdisk_control_write_message(conn, &response); } static void tapdisk_control_get_pid(struct tapdisk_ctl_conn *conn, tapdisk_message_t *request) { tapdisk_message_t response; memset(&response, 0, sizeof(response)); response.type = TAPDISK_MESSAGE_PID_RSP; response.cookie = request->cookie; response.u.tapdisk_pid = getpid(); tapdisk_control_write_message(conn, &response); } static void tapdisk_control_attach_vbd(struct tapdisk_ctl_conn *conn, tapdisk_message_t *request) { tapdisk_message_t response; char *devname = NULL; td_vbd_t *vbd; int minor, err; /* * TODO: check for max vbds per process */ vbd = tapdisk_server_get_vbd(request->cookie); if (vbd) { err = -EEXIST; goto out; } minor = request->cookie; if (minor < 0) { err = -EINVAL; goto out; } vbd = tapdisk_vbd_create(minor); if (!vbd) { err = -ENOMEM; goto out; } err = asprintf(&devname, BLKTAP2_RING_DEVICE"%d", minor); if (err == -1) { devname = NULL; err = -ENOMEM; goto fail_vbd; } err = tapdisk_vbd_attach(vbd, devname, minor); if (err) { ERR(err, "failure attaching to %s", devname); goto fail_vbd; } tapdisk_server_add_vbd(vbd); out: if 
(devname) free(devname); memset(&response, 0, sizeof(response)); response.type = TAPDISK_MESSAGE_ATTACH_RSP; response.cookie = request->cookie; response.u.response.error = -err; tapdisk_control_write_message(conn, &response); return; fail_vbd: tapdisk_vbd_detach(vbd); free(vbd); goto out; } static void tapdisk_control_detach_vbd(struct tapdisk_ctl_conn *conn, tapdisk_message_t *request) { tapdisk_message_t response; td_vbd_t *vbd; int err; vbd = tapdisk_server_get_vbd(request->cookie); if (!vbd) { err = -EINVAL; goto out; } if (vbd->name) { err = -EBUSY; goto out; } tapdisk_vbd_detach(vbd); if (list_empty(&vbd->images)) { tapdisk_server_remove_vbd(vbd); free(vbd); } err = 0; out: memset(&response, 0, sizeof(response)); response.type = TAPDISK_MESSAGE_DETACH_RSP; response.cookie = request->cookie; response.u.response.error = -err; tapdisk_control_write_message(conn, &response); } static void tapdisk_control_open_image(struct tapdisk_ctl_conn *conn, tapdisk_message_t *request) { int err; td_vbd_t *vbd; td_flag_t flags; tapdisk_message_t response; td_disk_info_t info; vbd = tapdisk_server_get_vbd(request->cookie); if (!vbd) { err = -EINVAL; goto out; } if (!vbd->tap) { err = -EINVAL; goto out; } if (vbd->name) { err = -EALREADY; goto out; } flags = 0; if (request->u.params.flags & TAPDISK_MESSAGE_FLAG_RDONLY) flags |= TD_OPEN_RDONLY; if (request->u.params.flags & TAPDISK_MESSAGE_FLAG_SHARED) flags |= TD_OPEN_SHAREABLE; if (request->u.params.flags & TAPDISK_MESSAGE_FLAG_ADD_CACHE) flags |= TD_OPEN_ADD_CACHE; if (request->u.params.flags & TAPDISK_MESSAGE_FLAG_VHD_INDEX) flags |= TD_OPEN_VHD_INDEX; if (request->u.params.flags & TAPDISK_MESSAGE_FLAG_LOG_DIRTY) flags |= TD_OPEN_LOG_DIRTY; if (request->u.params.flags & TAPDISK_MESSAGE_FLAG_ADD_LCACHE) flags |= TD_OPEN_LOCAL_CACHE; if (request->u.params.flags & TAPDISK_MESSAGE_FLAG_REUSE_PRT) flags |= TD_OPEN_REUSE_PARENT; if (request->u.params.flags & TAPDISK_MESSAGE_FLAG_STANDBY) flags |= TD_OPEN_STANDBY; if 
(request->u.params.flags & TAPDISK_MESSAGE_FLAG_SECONDARY) { char *name = strdup(request->u.params.secondary); if (!name) { err = -errno; goto out; } vbd->secondary_name = name; flags |= TD_OPEN_SECONDARY; } err = tapdisk_vbd_open_vdi(vbd, request->u.params.path, flags, request->u.params.prt_devnum); if (err) goto out; err = tapdisk_vbd_get_disk_info(vbd, &info); if (err) goto fail_close; err = tapdisk_blktap_create_device(vbd->tap, &info, !!(flags & TD_OPEN_RDONLY)); if (err && err != -EEXIST) { err = -errno; EPRINTF("create device failed: %d\n", err); goto fail_close; } err = 0; out: memset(&response, 0, sizeof(response)); response.cookie = request->cookie; if (err) { response.type = TAPDISK_MESSAGE_ERROR; response.u.response.error = -err; } else { response.u.image.sectors = info.size; response.u.image.sector_size = info.sector_size; response.u.image.info = info.info; response.type = TAPDISK_MESSAGE_OPEN_RSP; } tapdisk_control_write_message(conn, &response); return; fail_close: tapdisk_vbd_close_vdi(vbd); if (vbd->name) { free(vbd->name); vbd->name = NULL; } goto out; } static void tapdisk_control_close_image(struct tapdisk_ctl_conn *conn, tapdisk_message_t *request) { tapdisk_message_t response; td_vbd_t *vbd; int err; vbd = tapdisk_server_get_vbd(request->cookie); if (!vbd) { err = -ENODEV; goto out; } do { err = tapdisk_blktap_remove_device(vbd->tap); if (!err || err != -EBUSY) break; tapdisk_server_iterate(); } while (conn->fd >= 0); if (err) ERR(err, "failure closing image\n"); if (err == -ENOTTY) { while (!list_empty(&vbd->pending_requests)) tapdisk_server_iterate(); err = 0; } if (err) goto out; tapdisk_vbd_close_vdi(vbd); /* NB. vbd->name free should probably belong into close_vdi, but the current blktap1 reopen-stuff likely depends on a lifetime extended until shutdown. 
*/ free(vbd->name); vbd->name = NULL; if (!vbd->tap) { tapdisk_server_remove_vbd(vbd); free(vbd); } out: memset(&response, 0, sizeof(response)); response.type = TAPDISK_MESSAGE_CLOSE_RSP; response.cookie = request->cookie; response.u.response.error = -err; tapdisk_control_write_message(conn, &response); } static void tapdisk_control_pause_vbd(struct tapdisk_ctl_conn *conn, tapdisk_message_t *request) { int err; td_vbd_t *vbd; tapdisk_message_t response; memset(&response, 0, sizeof(response)); response.type = TAPDISK_MESSAGE_PAUSE_RSP; vbd = tapdisk_server_get_vbd(request->cookie); if (!vbd) { err = -EINVAL; goto out; } do { err = tapdisk_vbd_pause(vbd); if (!err || err != -EAGAIN) break; tapdisk_server_iterate(); } while (conn->fd >= 0); out: response.cookie = request->cookie; response.u.response.error = -err; tapdisk_control_write_message(conn, &response); } static void tapdisk_control_resume_vbd(struct tapdisk_ctl_conn *conn, tapdisk_message_t *request) { int err; td_vbd_t *vbd; tapdisk_message_t response; const char *desc = NULL; memset(&response, 0, sizeof(response)); response.type = TAPDISK_MESSAGE_RESUME_RSP; vbd = tapdisk_server_get_vbd(request->cookie); if (!vbd) { err = -EINVAL; goto out; } if (request->u.params.path[0]) desc = request->u.params.path; err = tapdisk_vbd_resume(vbd, desc); out: response.cookie = request->cookie; response.u.response.error = -err; tapdisk_control_write_message(conn, &response); } static void tapdisk_control_stats(struct tapdisk_ctl_conn *conn, tapdisk_message_t *request) { tapdisk_message_t response; td_stats_t _st, *st = &_st; td_vbd_t *vbd; size_t rv; tapdisk_stats_init(st, conn->out.buf + sizeof(response), conn->out.bufsz - sizeof(response)); if (request->cookie != (uint16_t)-1) { vbd = tapdisk_server_get_vbd(request->cookie); if (!vbd) { rv = -ENODEV; goto out; } tapdisk_vbd_stats(vbd, st); } else { struct list_head *list = tapdisk_server_get_all_vbds(); tapdisk_stats_enter(st, '['); list_for_each_entry(vbd, list, next) 
tapdisk_vbd_stats(vbd, st); tapdisk_stats_leave(st, ']'); } rv = tapdisk_stats_length(st); out: memset(&response, 0, sizeof(response)); response.type = TAPDISK_MESSAGE_STATS_RSP; response.cookie = request->cookie; response.u.info.length = rv; tapdisk_control_write_message(conn, &response); if (rv > 0) conn->out.prod += rv; } struct tapdisk_control_info message_infos[] = { [TAPDISK_MESSAGE_PID] = { .handler = tapdisk_control_get_pid, .flags = TAPDISK_MSG_REENTER, }, [TAPDISK_MESSAGE_LIST] = { .handler = tapdisk_control_list, .flags = TAPDISK_MSG_REENTER, }, [TAPDISK_MESSAGE_ATTACH] = { .handler = tapdisk_control_attach_vbd, .flags = TAPDISK_MSG_VERBOSE, }, [TAPDISK_MESSAGE_DETACH] = { .handler = tapdisk_control_detach_vbd, .flags = TAPDISK_MSG_VERBOSE, }, [TAPDISK_MESSAGE_OPEN] = { .handler = tapdisk_control_open_image, .flags = TAPDISK_MSG_VERBOSE, }, [TAPDISK_MESSAGE_PAUSE] = { .handler = tapdisk_control_pause_vbd, .flags = TAPDISK_MSG_VERBOSE, }, [TAPDISK_MESSAGE_RESUME] = { .handler = tapdisk_control_resume_vbd, .flags = TAPDISK_MSG_VERBOSE, }, [TAPDISK_MESSAGE_CLOSE] = { .handler = tapdisk_control_close_image, .flags = TAPDISK_MSG_VERBOSE, }, [TAPDISK_MESSAGE_STATS] = { .handler = tapdisk_control_stats, .flags = TAPDISK_MSG_REENTER, }, }; static void tapdisk_control_handle_request(event_id_t id, char mode, void *private) { int err, excl; tapdisk_message_t message, response; struct tapdisk_ctl_conn *conn = private; struct tapdisk_control_info *info; err = tapdisk_control_read_message(conn->fd, &message, 2); if (err) goto close; if (conn->in.busy) goto busy; err = tapdisk_control_validate_request(&message); if (err) goto invalid; if (message.type > TAPDISK_MESSAGE_EXIT) goto invalid; info = &message_infos[message.type]; if (!info->handler) goto invalid; if (info->flags & TAPDISK_MSG_VERBOSE) DBG("received '%s' message (uuid = %u)\n", tapdisk_message_name(message.type), message.cookie); excl = !(info->flags & TAPDISK_MSG_REENTER); if (excl) { if (td_control.busy) 
goto busy; td_control.busy = 1; } conn->in.busy = 1; conn->info = info; info->handler(conn, &message); conn->in.busy = 0; if (excl) td_control.busy = 0; tapdisk_control_release_connection(conn); return; error: memset(&response, 0, sizeof(response)); response.type = TAPDISK_MESSAGE_ERROR; response.u.response.error = (err ? -err : EINVAL); tapdisk_control_write_message(conn, &response); close: tapdisk_control_close_connection(conn); return; busy: err = -EBUSY; ERR(err, "rejecting message '%s' while busy\n", tapdisk_message_name(message.type)); goto error; invalid: err = -EINVAL; ERR(err, "rejecting unsupported message '%s'\n", tapdisk_message_name(message.type)); goto error; } static void tapdisk_control_accept(event_id_t id, char mode, void *private) { int err, fd; struct tapdisk_ctl_conn *conn; fd = accept(td_control.socket, NULL, NULL); if (fd == -1) { ERR(-errno, "failed to accept new control connection: %d\n", errno); return; } conn = tapdisk_ctl_conn_open(fd); if (!conn) { close(fd); ERR(-ENOMEM, "failed to allocate new control connection\n"); return; } err = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, conn->fd, TD_CTL_RECV_TIMEOUT, tapdisk_control_handle_request, conn); if (err == -1) { tapdisk_control_close_connection(conn); ERR(err, "failed to register new control event\n"); return; } conn->in.event_id = err; } static int tapdisk_control_mkdir(const char *dir) { int err; char *ptr, *name, *start; err = access(dir, W_OK | R_OK); if (!err) return 0; name = strdup(dir); if (!name) return -ENOMEM; start = name; for (;;) { ptr = strchr(start + 1, '/'); if (ptr) *ptr = '\0'; err = mkdir(name, 0755); if (err && errno != EEXIST) { err = -errno; EPRINTF("failed to create directory %s: %d\n", name, err); break; } if (!ptr) break; else { *ptr = '/'; start = ptr + 1; } } free(name); return err; } static int tapdisk_control_create_socket(char **socket_path) { struct sockaddr_un saddr; int err; err = tapdisk_control_mkdir(BLKTAP2_CONTROL_DIR); if (err) { 
EPRINTF("failed to create directory %s: %d\n", BLKTAP2_CONTROL_DIR, err); return err; } err = asprintf(&td_control.path, "%s/%s%d", BLKTAP2_CONTROL_DIR, BLKTAP2_CONTROL_SOCKET, getpid()); if (err == -1) { td_control.path = NULL; err = (errno ? : ENOMEM); goto fail; } if (unlink(td_control.path) && errno != ENOENT) { err = errno; EPRINTF("failed to unlink %s: %d\n", td_control.path, errno); goto fail; } td_control.socket = socket(AF_UNIX, SOCK_STREAM, 0); if (td_control.socket == -1) { err = errno; EPRINTF("failed to create control socket: %d\n", err); goto fail; } memset(&saddr, 0, sizeof(saddr)); strncpy(saddr.sun_path, td_control.path, sizeof(saddr.sun_path)); saddr.sun_family = AF_UNIX; err = bind(td_control.socket, (const struct sockaddr *)&saddr, sizeof(saddr)); if (err == -1) { err = errno; EPRINTF("failed to bind to %s: %d\n", saddr.sun_path, err); goto fail; } err = listen(td_control.socket, TD_CTL_SOCK_BACKLOG); if (err == -1) { err = errno; EPRINTF("failed to listen: %d\n", err); goto fail; } err = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, td_control.socket, 0, tapdisk_control_accept, NULL); if (err < 0) { EPRINTF("failed to add watch: %d\n", err); goto fail; } td_control.event_id = err; *socket_path = td_control.path; return 0; fail: tapdisk_control_close(); return err; } int tapdisk_control_open(char **path) { tapdisk_control_initialize(); return tapdisk_control_create_socket(path); } blktap-2.0.90/drivers/tapdisk-utils.h0000644000000000000000000000432611664745551016224 0ustar rootroot/* * Copyright (c) 2008, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _TAPDISK_UTILS_H_ #define _TAPDISK_UTILS_H_ #include #include #define MAX_NAME_LEN 1000 #define TD_SYSLOG_IDENT_MAX 32 #define TD_SYSLOG_STRTIME_LEN 15 int tapdisk_syslog_facility(const char *); char* tapdisk_syslog_ident(const char *); size_t tapdisk_syslog_strftime(char *, size_t, const struct timeval *); size_t tapdisk_syslog_strftv(char *, size_t, const struct timeval *); int tapdisk_set_resource_limits(void); int tapdisk_namedup(char **, const char *); int tapdisk_parse_disk_type(const char *, char **, int *); int tapdisk_get_image_size(int, uint64_t *, uint32_t *); int tapdisk_linux_version(void); #endif blktap-2.0.90/drivers/tapdisk-syslog.h0000644000000000000000000000527711664745551016412 0ustar rootroot/* * Copyright (c) 2009, XenSource Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef __TAPDISK_SYSLOG_H__ #define __TAPDISK_SYSLOG_H__ #include #include #include "scheduler.h" typedef struct _td_syslog td_syslog_t; #define TD_SYSLOG_PACKET_MAX 1024 struct _td_syslog_stats { unsigned long long count; unsigned long long bytes; unsigned long long xmits; unsigned long long fails; unsigned long long drops; }; struct _td_syslog { char *ident; int facility; int sock; event_id_t event_id; void *buf; size_t bufsz; char *msg; char *ring; size_t ringsz; size_t prod; size_t cons; int oom; struct timeval oom_tv; struct _td_syslog_stats stats; }; int tapdisk_syslog_open(td_syslog_t *, const char *ident, int facility, size_t bufsz); void tapdisk_syslog_close(td_syslog_t *); void tapdisk_syslog_flush(td_syslog_t *); void tapdisk_syslog_stats(td_syslog_t *, int prio); int tapdisk_vsyslog(td_syslog_t *, int prio, const char *fmt, va_list ap); int tapdisk_syslog(td_syslog_t *, int prio, const char *fmt, ...); #endif /* __TAPDISK_SYSLOG_H__ */ blktap-2.0.90/drivers/tapdisk-loglimit.h0000644000000000000000000000371411664745551016704 0ustar rootroot/* * Copyright (c) 2011, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef __TAPDISK_LOGLIMIT_H__ #define __TAPDISK_LOGLIMIT_H__ #include #include "list.h" typedef struct td_loglimit td_loglimit_t; struct td_loglimit { int burst; int interval; int count; int dropped; struct timeval ts; }; void tapdisk_loglimit_init(td_loglimit_t *rl, int burst, int interval); int tapdisk_loglimit_pass(td_loglimit_t *); #endif /* __TAPDISK_LOGLIMIT_H__ */ blktap-2.0.90/drivers/tapdisk-filter.c0000644000000000000000000001467511664745551016354 0ustar rootroot/* * Copyright (c) 2007, XenSource Inc. * Copyright (c) 2010, Citrix Systems, Inc. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. 
nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include #include "tapdisk-log.h" #include "tapdisk-filter.h" #define RSEED 7 #define PRE_CHECK 0 #define POST_CHECK 1 #define WRITE_INTEGRITY "buffer integrity failure after write" #define READ_INTEGRITY "disk integrity failure after read" #define DBG(f, a...) 
tlog_write(TLOG_WARN, f, ##a) /* * simulate IO errors by knocking request size to zero before * submitting and restoring original size before returning */ static inline void inject_fault(struct tfilter *filter, struct iocb *io) { struct fiocb *fio; if (!filter->ffree) return; fio = filter->flist[--filter->ffree]; fio->bytes = io->u.c.nbytes; fio->data = io->data; io->u.c.nbytes = 0; io->data = fio; } static inline int fault_injected(struct tfilter *filter, struct iocb *io) { unsigned long iop = (unsigned long)io->data; unsigned long start = (unsigned long)filter->fiocbs; unsigned long end = start + (filter->iocbs * sizeof(struct fiocb)); return (iop >= start && iop < end); } static inline void recover_fault(struct tfilter *filter, struct iocb *io) { struct fiocb *fio = (struct fiocb *)io->data; io->u.c.nbytes = fio->bytes; io->data = fio->data; memset(fio, 0, sizeof(struct fiocb)); filter->flist[filter->ffree++] = fio; } static inline uint64_t chksum(char *buf) { int i, num = 512 >> 3; uint64_t *p = (uint64_t *)buf; uint64_t sum = 0; for (i = 0; i < num; i++) sum += p[i]; return sum; } static inline void check_hash(struct tfilter *filter, uint64_t sec, char *buf, char *type) { uint64_t sum; struct dhash *hash; hash = filter->dhash + sec; if (!hash->time.tv_sec) return; sum = chksum(buf); if (hash->hash != chksum(buf)) { struct timeval now; gettimeofday(&now, NULL); DBG("%s: hash table: 0x%020" PRIx64 " at %012lu.%06lu, " "from disk: 0x%020" PRIx64 " at %012lu.%06lu\n", type, hash->hash, hash->time.tv_sec, hash->time.tv_usec, sum, now.tv_sec, now.tv_usec); } } static inline void insert_hash(struct tfilter *filter, uint64_t sec, char *buf) { struct dhash *hash; hash = filter->dhash + sec; hash->hash = chksum(buf); gettimeofday(&hash->time, NULL); } static void check_sector(struct tfilter *filter, int type, int rw, uint64_t sec, char *buf) { if (sec >= filter->secs) return; if (rw) { if (type == PRE_CHECK) insert_hash(filter, sec, buf); else check_hash(filter, sec, 
buf, WRITE_INTEGRITY); } else if (type == POST_CHECK) { check_hash(filter, sec, buf, READ_INTEGRITY); insert_hash(filter, sec, buf); } } static void check_data(struct tfilter *filter, int type, struct iocb *io) { int rw; uint64_t i; rw = (io->aio_lio_opcode == IO_CMD_PWRITE); for (i = 0; i < io->u.c.nbytes; i += 512) { char *buf = io->u.c.buf + i; uint64_t sec = (io->u.c.offset + i) >> 9; check_sector(filter, type, rw, sec, buf); } } struct tfilter * tapdisk_init_tfilter(int mode, int iocbs, uint64_t secs) { int i; struct tfilter *filter = NULL; if (!mode) return NULL; filter = calloc(1, sizeof(struct tfilter)); if (!filter) goto fail; filter->mode = mode; filter->secs = secs; filter->iocbs = iocbs; if (filter->mode & TD_INJECT_FAULTS) { filter->fiocbs = calloc(iocbs, sizeof(struct fiocb)); filter->flist = calloc(iocbs, sizeof(struct fiocb *)); if (!filter->fiocbs || !filter->flist) filter->mode &= ~TD_INJECT_FAULTS; else { srand(RSEED); filter->ffree = iocbs; for (i = 0; i < iocbs; i++) filter->flist[i] = filter->fiocbs + i; } } if (filter->mode & TD_CHECK_INTEGRITY) { filter->dhash = calloc(secs, sizeof(struct dhash)); if (!filter->dhash) filter->mode &= ~TD_CHECK_INTEGRITY; } syslog(LOG_WARNING, "WARNING: " "FILTERING IN MODE 0x%04x\n", filter->mode); return filter; fail: tapdisk_free_tfilter(filter); return NULL; } void tapdisk_free_tfilter(struct tfilter *filter) { if (!filter) return; free(filter->dhash); free(filter->flist); free(filter->fiocbs); free(filter); } void tapdisk_filter_iocbs(struct tfilter *filter, struct iocb **iocbs, int num) { int i; if (!filter) return; for (i = 0; i < num; i++) { struct iocb *io = iocbs[i]; if (filter->mode & TD_INJECT_FAULTS) { if ((random() % 100) <= TD_FAULT_RATE) { inject_fault(filter, io); continue; } } if (filter->mode & TD_CHECK_INTEGRITY) check_data(filter, PRE_CHECK, io); } } void tapdisk_filter_events(struct tfilter *filter, struct io_event *events, int num) { int i; if (!filter) return; for (i = 0; i < num; i++) 
{ struct iocb *io = events[i].obj; if (filter->mode & TD_INJECT_FAULTS) { if (fault_injected(filter, io)) { recover_fault(filter, io); continue; } } if (filter->mode & TD_CHECK_INTEGRITY) check_data(filter, POST_CHECK, io); } } blktap-2.0.90/drivers/linux-blktap.h0000644000000000000000000000475311664745551016045 0ustar rootroot/* * Copyright (c) 2011, XenSource Inc. * All rights reserved. */ #ifndef _LINUX_BLKTAP_H #define _LINUX_BLKTAP_H /* * Control */ #define BLKTAP_IOCTL_RESPOND 1 #define BLKTAP_IOCTL_ALLOC_TAP 200 #define BLKTAP_IOCTL_FREE_TAP 201 #define BLKTAP_IOCTL_CREATE_DEVICE 208 #define BLKTAP_IOCTL_REMOVE_DEVICE 207 struct blktap_info { unsigned int ring_major; unsigned int bdev_major; unsigned int ring_minor; }; struct blktap_device_info { unsigned long long capacity; unsigned int sector_size; unsigned int physical_sector_size; unsigned long flags; unsigned long __rsvd[4]; }; #define BLKTAP_DEVICE_RO 0x00000001UL /* * I/O ring */ #ifdef __KERNEL__ #include #define BLKTAP_PAGE_SIZE PAGE_SIZE #include #define BLKTAP_RD32(_n) rounddown_pow_of_two(_n) #endif #define __BLKTAP_RING_SIZE(_sz) \ ((unsigned int) \ BLKTAP_RD32(((_sz) - offsetof(struct blktap_sring, entry)) / \ sizeof(union blktap_ring_entry))) typedef struct blktap_ring_request blktap_ring_req_t; typedef struct blktap_ring_response blktap_ring_rsp_t; struct blktap_segment { uint32_t __pad; uint8_t first_sect; uint8_t last_sect; }; #define BLKTAP_OP_READ 0 #define BLKTAP_OP_WRITE 1 #define BLKTAP_SEGMENT_MAX 11 struct blktap_ring_request { uint8_t operation; uint8_t nr_segments; uint16_t __pad; uint64_t id; uint64_t sector_number; struct blktap_segment seg[BLKTAP_SEGMENT_MAX]; }; #define BLKTAP_RSP_EOPNOTSUPP -2 #define BLKTAP_RSP_ERROR -1 #define BLKTAP_RSP_OKAY 0 struct blktap_ring_response { uint64_t id; uint8_t operation; int16_t status; }; union blktap_ring_entry { struct blktap_ring_request req; struct blktap_ring_response rsp; }; struct blktap_sring { uint32_t req_prod; uint32_t 
__req_event; uint32_t rsp_prod; uint32_t __rsp_event; uint8_t msg; uint8_t __rsvd[47]; union blktap_ring_entry entry[0]; }; /* * Ring messages + old ioctls (DEPRECATED) */ #define BLKTAP_RING_MESSAGE_CLOSE 3 #define BLKTAP_IOCTL_CREATE_DEVICE_COMPAT 202 #define BLKTAP_NAME_MAX 256 struct blktap2_params { char name[BLKTAP_NAME_MAX]; unsigned long long capacity; unsigned long sector_size; }; #endif /* _LINUX_BLKTAP_H */ blktap-2.0.90/drivers/tapdisk-syslog.c0000644000000000000000000002633411664745551016402 0ustar rootroot/* * Copyright (c) 2009, XenSource Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of XenSource Inc. nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * A non-blocking, buffered BSD syslog client. * * http://www.ietf.org/rfc/rfc3164.txt (FIXME: Read this.) */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #define _ISOC99_SOURCE #include #include #include #include #include #include #include #include #include #include #include "tapdisk-server.h" #include "tapdisk-syslog.h" #include "tapdisk-utils.h" #define MIN(a,b) (((a) < (b)) ? (a) : (b)) static int tapdisk_syslog_sock_send(td_syslog_t *log, const void *msg, size_t size); static int tapdisk_syslog_sock_connect(td_syslog_t *log); static void tapdisk_syslog_sock_mask(td_syslog_t *log); static void tapdisk_syslog_sock_unmask(td_syslog_t *log); static const struct sockaddr_un syslog_addr = { .sun_family = AF_UNIX, .sun_path = "/dev/log" }; #define RING_PTR(_log, _idx) \ (&(_log)->ring[(_idx) % (_log)->ringsz]) #define RING_FREE(_log) \ ((_log)->ringsz - ((_log)->prod - (_log)->cons)) /* * NB. Ring buffer. * * We allocate a number of pages as indicated by @bufsz during * initialization. From that, 1K is reserved for message staging, the * rest is cyclic ring space. * * All producer/consumer offsets wrap on size_t range, not buffer * size. Hence the RING() macros. 
*/ static void __tapdisk_syslog_ring_init(td_syslog_t *log) { log->buf = NULL; log->bufsz = 0; log->msg = NULL; log->ring = NULL; log->ringsz = 0; } static inline size_t page_align(size_t size) { size_t page_size = sysconf(_SC_PAGE_SIZE); return (size + page_size - 1) & ~(page_size - 1); } static void tapdisk_syslog_ring_uninit(td_syslog_t *log) { if (log->buf) munmap(log->buf, log->bufsz); __tapdisk_syslog_ring_init(log); } static int tapdisk_syslog_ring_init(td_syslog_t *log, size_t size) { int prot, flags, err; __tapdisk_syslog_ring_init(log); log->bufsz = page_align(size); prot = PROT_READ|PROT_WRITE; flags = MAP_ANONYMOUS|MAP_PRIVATE; log->buf = mmap(NULL, log->bufsz, prot, flags, -1, 0); if (log->buf == MAP_FAILED) { log->buf = NULL; err = -ENOMEM; goto fail; } err = mlock(log->buf, size); if (err) { err = -errno; goto fail; } log->msg = log->buf; log->ring = log->buf + TD_SYSLOG_PACKET_MAX; log->ringsz = size - TD_SYSLOG_PACKET_MAX; return 0; fail: tapdisk_syslog_ring_uninit(log); return err; } static int tapdisk_syslog_ring_write_str(td_syslog_t *log, const char *msg, size_t len) { size_t size, prod, i; len = MIN(len, TD_SYSLOG_PACKET_MAX); size = len + 1; if (size > RING_FREE(log)) return -ENOBUFS; prod = log->prod; for (i = 0; i < len; ++i) { char c; c = msg[i]; if (c == 0) break; *RING_PTR(log, prod) = c; prod++; } *RING_PTR(log, prod) = 0; log->prod = prod + 1; return 0; } static ssize_t tapdisk_syslog_ring_read_pkt(td_syslog_t *log, char *msg, size_t size) { size_t cons; ssize_t sz; size = MIN(size, TD_SYSLOG_PACKET_MAX); sz = 0; cons = log->cons; while (sz < size) { char c; if (cons == log->prod) break; c = *RING_PTR(log, cons); msg[sz++] = c; cons++; if (c == 0) break; } return sz - 1; } static int tapdisk_syslog_ring_dispatch_one(td_syslog_t *log) { size_t len; int err; len = tapdisk_syslog_ring_read_pkt(log, log->msg, TD_SYSLOG_PACKET_MAX); if (len == -1) return -ENOMSG; err = tapdisk_syslog_sock_send(log, log->msg, len); if (err == -EAGAIN) return 
err; if (err) goto fail; done: log->cons += len + 1; return 0; fail: log->stats.fails++; goto done; } static void tapdisk_syslog_ring_warning(td_syslog_t *log) { int n, err; n = log->oom; log->oom = 0; err = tapdisk_syslog(log, LOG_WARNING, "tapdisk-syslog: %d messages dropped", n); if (err) log->oom = n; } static void tapdisk_syslog_ring_dispatch(td_syslog_t *log) { int err; do { err = tapdisk_syslog_ring_dispatch_one(log); } while (!err); if (log->oom) tapdisk_syslog_ring_warning(log); } static int tapdisk_syslog_vsprintf(char *buf, size_t size, int prio, const struct timeval *tv, const char *ident, const char *fmt, va_list ap) { char tsbuf[TD_SYSLOG_STRTIME_LEN+1]; size_t len; /* * PKT := PRI HEADER MSG * PRI := "<" {"0" .. "9"} ">" * HEADER := TIMESTAMP HOSTNAME * MSG := * SEP := ":" | " " | "[" */ tapdisk_syslog_strftime(tsbuf, sizeof(tsbuf), tv); len = 0; /* NB. meant to work with c99 null buffers */ len += snprintf(buf ? buf + len : NULL, buf ? size - len : 0, "<%d>%s %s: ", prio, tsbuf, ident); len += vsnprintf(buf ? buf + len : NULL, buf ? size - len : 0, fmt, ap); return MIN(len, size); } /* * NB. Sockets. * * Syslog is based on a connectionless (DGRAM) unix transport. * * While it is reliable, we cannot block on syslogd because -- as with * any IPC in tapdisk -- we could deadlock in page I/O writeback. * Hence the syslog(3) avoidance on the datapath, which this code * facilitates. * * This type of socket has a single (global) receive buffer on * syslogd's end, but no send buffer at all. The does just that: * headroom on the sender side. * * The transport is rather stateless, but we still need to connect() * the socket, or select() will find no receive buffer to block * on. While we never disconnect, connections are unreliable because * syslog may shut down. * * Reconnection will be attempted with every user message submitted. * Any send() or connect() failure other than EAGAIN discards the * message. 
Also, the write event handler will go on to discard any * remaining ring contents as well, once the socket is disconnected. * * In summary, no attempts to mask service blackouts in here. */ int tapdisk_vsyslog(td_syslog_t *log, int prio, const char *fmt, va_list ap) { struct timeval now; size_t len; int err; gettimeofday(&now, NULL); len = tapdisk_syslog_vsprintf(log->msg, TD_SYSLOG_PACKET_MAX, prio | log->facility, &now, log->ident, fmt, ap); log->stats.count += 1; log->stats.bytes += len; if (log->cons != log->prod) goto busy; send: err = tapdisk_syslog_sock_send(log, log->msg, len); if (!err) return 0; if (err == -ENOTCONN) { err = tapdisk_syslog_sock_connect(log); if (!err) goto send; } if (err != -EAGAIN) goto fail; tapdisk_syslog_sock_unmask(log); busy: if (log->oom) { err = -ENOBUFS; goto oom; } err = tapdisk_syslog_ring_write_str(log, log->msg, len); if (!err) return 0; log->oom_tv = now; oom: log->oom++; log->stats.drops++; return err; fail: log->stats.fails++; return err; } int tapdisk_syslog(td_syslog_t *log, int prio, const char *fmt, ...) 
{ va_list ap; int err; va_start(ap, fmt); err = tapdisk_vsyslog(log, prio, fmt, ap); va_end(ap); return err; } static int tapdisk_syslog_sock_send(td_syslog_t *log, const void *msg, size_t size) { ssize_t n; log->stats.xmits++; n = send(log->sock, msg, size, MSG_DONTWAIT); if (n < 0) return -errno; return 0; } static void tapdisk_syslog_sock_event(event_id_t id, char mode, void *private) { td_syslog_t *log = private; tapdisk_syslog_ring_dispatch(log); if (log->cons == log->prod) tapdisk_syslog_sock_mask(log); } static void __tapdisk_syslog_sock_init(td_syslog_t *log) { log->sock = -1; log->event_id = -1; } static void tapdisk_syslog_sock_close(td_syslog_t *log) { if (log->sock >= 0) close(log->sock); if (log->event_id >= 0) tapdisk_server_unregister_event(log->event_id); __tapdisk_syslog_sock_init(log); } static int tapdisk_syslog_sock_open(td_syslog_t *log) { event_id_t id; int s, err; __tapdisk_syslog_sock_init(log); s = socket(PF_UNIX, SOCK_DGRAM, 0); if (s < 0) { err = -errno; goto fail; } log->sock = s; #if 0 err = fcntl(s, F_SETFL, O_NONBLOCK); if (err < 0) { err = -errno; goto fail; } #endif id = tapdisk_server_register_event(SCHEDULER_POLL_WRITE_FD, s, 0, tapdisk_syslog_sock_event, log); if (id < 0) { err = id; goto fail; } log->event_id = id; tapdisk_syslog_sock_mask(log); return 0; fail: tapdisk_syslog_sock_close(log); return err; } static int tapdisk_syslog_sock_connect(td_syslog_t *log) { int err; err = connect(log->sock, &syslog_addr, sizeof(syslog_addr)); if (err < 0) err = -errno; return err; } static void tapdisk_syslog_sock_mask(td_syslog_t *log) { tapdisk_server_mask_event(log->event_id, 1); } static void tapdisk_syslog_sock_unmask(td_syslog_t *log) { tapdisk_server_mask_event(log->event_id, 0); } void __tapdisk_syslog_init(td_syslog_t *log) { memset(log, 0, sizeof(td_syslog_t)); __tapdisk_syslog_sock_init(log); __tapdisk_syslog_ring_init(log); } void tapdisk_syslog_close(td_syslog_t *log) { tapdisk_syslog_ring_uninit(log); 
tapdisk_syslog_sock_close(log); if (log->ident) free(log->ident); __tapdisk_syslog_init(log); } int tapdisk_syslog_open(td_syslog_t *log, const char *ident, int facility, size_t bufsz) { int err; __tapdisk_syslog_init(log); log->facility = facility; log->ident = ident ? strndup(ident, TD_SYSLOG_IDENT_MAX) : NULL; err = tapdisk_syslog_sock_open(log); if (err) goto fail; err = tapdisk_syslog_ring_init(log, bufsz); if (err) goto fail; return 0; fail: tapdisk_syslog_close(log); return err; } void tapdisk_syslog_stats(td_syslog_t *log, int prio) { struct _td_syslog_stats *s = &log->stats; tapdisk_syslog(log, prio, "tapdisk-syslog: %llu messages, %llu bytes, " "xmits: %llu, failed: %llu, dropped: %llu", s->count, s->bytes, s->xmits, s->fails, s->drops); } void tapdisk_syslog_flush(td_syslog_t *log) { while (log->cons != log->prod) tapdisk_server_iterate(); } blktap-2.0.90/drivers/td-rated.1.txt0000644000000000000000000001426211664745551015662 0ustar rootroot SYNOPSIS td-rated -type {token|leaky|meminfo} -- [options] DESCRIPTION The td-rated 'bridge' is a daemon program to which one or a number of tapdisk processes connect, in order to cooperatively limit the data rate at which they will issue I/O requests to physical storage. A data rate denotes I/O bandwidth, i.e. an (average) amount of data over time. A rate limiter is a state machine dispatching an overall queue of incoming I/O requests, at a desired data rate. The td-rated program included a number of alternative rate limiting algorithms for various purposes. Rate limiters are discussed below. The standard client implementation in tapdisk is a transparent filter driver, of type name 'valve'. Valves are typically inserted at either the top of certain level of the disk image stack constituting a VDI, thereby uniformly limiting any I/O issued. Every bridge process constitutes a single rate limiter. Arbitrary numbers of client valves can connect to each bridge. 
I/O requests issued by clients are normally aggregated, dividing the available bandwidth among all active clients. OPTIONS Token Bucket Token bucket is a rate limiter which drains a request queue of pending I/O requests at a given overall data rate. It is invoked as follows: td-rated -t token -- .. --rate Bandwidth limit [B/s]. --cap Burst (aggregated credit) limit [B]. Token bucket's main feature over basic constant-rate algorithms (leaky buckets) is that it allows for I/O bursts. Bursts are batches of data requests, which are preferably issued simultaneously to reduce the overall number of seeks involved on shared rotational media. With bursty I/O transfers, bandwidth may transiently exceed the nominal data rate, but in a controlled fashion. Different from a constant rate output, the I/O output rate is maintained as an average over periods of time. Internally, bursts issued at any time instant consume bandwidth credit ('tokens'). Credit gets accumulated, at the given rate, over time. Once exhausted, credit taken must be amortized before additional I/O can pass. That is, while the rate set will limit an output data rate, it does so only indirectly, by limiting the rate at which new credit is assigned. The cap argument is a limit to accumulated credit. Excess credit above the given capacity will be discarded. Caps limit the maximum burst size observable. The maximum only becomes available whenever all clients have remained idle for a time period of cap/rate. A token bucket allows for bursts; it does not promote or enforce them at all. Once the configured bandwidth credit is exceeded, amortization time is applied to client request batches individually, in the order in which they were issued, and output will effectively degrade to a constant data rate. Leaky Bucket Leaky bucket is a simpler constant rate algorithm. Requests are issued in a round-robin fashion. The given rate is never exceeded, so requests.
This is presently equivalent to a token bucket with a cap value of zero, and therefore implemented accordingly. td-rated -t leaky -- .. --rate Bandwidth limit [B/s]. Meminfo Driver Meminfo is an experimental rate limiting driver aiming specifically at write bandwidth reduction for tapdisk I/O modes targeting the host OS buffer cache. It is invoked as follows td-rated -t meminfo -- .. --high [% of total memory] --low [% of total memory] [--period