pax_global_header00006660000000000000000000000064141446477460014533gustar00rootroot0000000000000052 comment=6bb085f5704dd4c3841c79504f2aed2228e6d76a sbd-1.5.1/000077500000000000000000000000001414464774600123075ustar00rootroot00000000000000sbd-1.5.1/AUTHORS000066400000000000000000000000001414464774600133450ustar00rootroot00000000000000sbd-1.5.1/COPYING000066400000000000000000000432541414464774600133520ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Lesser General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. 
For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". 
Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. 
c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. 
You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. 
If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. 
If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. 
The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. <one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. Also add information on how to contact you by electronic and paper mail. 
If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. , 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. sbd-1.5.1/ChangeLog000066400000000000000000000000001414464774600140470ustar00rootroot00000000000000sbd-1.5.1/INSTALL000066400000000000000000000363321414464774600133470ustar00rootroot00000000000000Installation Instructions ************************* Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc. Copying and distribution of this file, with or without modification, are permitted in any medium without royalty provided the copyright notice and this notice are preserved. This file is offered as-is, without warranty of any kind. 
Basic Installation ================== Briefly, the shell commands `./configure; make; make install' should configure, build, and install this package. The following more-detailed instructions are generic; see the `README' file for instructions specific to this package. Some packages provide this `INSTALL' file but do not implement all of the features documented below. The lack of an optional feature in a given package is not necessarily a bug. More recommendations for GNU packages can be found in *note Makefile Conventions: (standards)Makefile Conventions. The `configure' shell script attempts to guess correct values for various system-dependent variables used during compilation. It uses those values to create a `Makefile' in each directory of the package. It may also create one or more `.h' files containing system-dependent definitions. Finally, it creates a shell script `config.status' that you can run in the future to recreate the current configuration, and a file `config.log' containing compiler output (useful mainly for debugging `configure'). It can also use an optional file (typically called `config.cache' and enabled with `--cache-file=config.cache' or simply `-C') that saves the results of its tests to speed up reconfiguring. Caching is disabled by default to prevent problems with accidental use of stale cache files. If you need to do unusual things to compile the package, please try to figure out how `configure' could check whether to do them, and mail diffs or instructions to the address given in the `README' so they can be considered for the next release. If you are using the cache, and at some point `config.cache' contains results you don't want to keep, you may remove or edit it. The file `configure.ac' (or `configure.in') is used to create `configure' by a program called `autoconf'. You need `configure.ac' if you want to change it or regenerate `configure' using a newer version of `autoconf'. The simplest way to compile this package is: 1. 
`cd' to the directory containing the package's source code and type `./configure' to configure the package for your system. Running `configure' might take a while. While running, it prints some messages telling which features it is checking for. 2. Type `make' to compile the package. 3. Optionally, type `make check' to run any self-tests that come with the package, generally using the just-built uninstalled binaries. 4. Type `make install' to install the programs and any data files and documentation. When installing into a prefix owned by root, it is recommended that the package be configured and built as a regular user, and only the `make install' phase executed with root privileges. 5. Optionally, type `make installcheck' to repeat any self-tests, but this time using the binaries in their final installed location. This target does not install anything. Running this target as a regular user, particularly if the prior `make install' required root privileges, verifies that the installation completed correctly. 6. You can remove the program binaries and object files from the source code directory by typing `make clean'. To also remove the files that `configure' created (so you can compile the package for a different kind of computer), type `make distclean'. There is also a `make maintainer-clean' target, but that is intended mainly for the package's developers. If you use it, you may have to get all sorts of other programs in order to regenerate files that came with the distribution. 7. Often, you can also type `make uninstall' to remove the installed files again. In practice, not all packages have tested that uninstallation works correctly, even though it is required by the GNU Coding Standards. 8. Some packages, particularly those that use Automake, provide `make distcheck', which can be used by developers to test that all other targets like `make install' and `make uninstall' work correctly. This target is generally not run by end users. 
Compilers and Options ===================== Some systems require unusual options for compilation or linking that the `configure' script does not know about. Run `./configure --help' for details on some of the pertinent environment variables. You can give `configure' initial values for configuration parameters by setting variables in the command line or in the environment. Here is an example: ./configure CC=c99 CFLAGS=-g LIBS=-lposix *Note Defining Variables::, for more details. Compiling For Multiple Architectures ==================================== You can compile the package for more than one kind of computer at the same time, by placing the object files for each architecture in their own directory. To do this, you can use GNU `make'. `cd' to the directory where you want the object files and executables to go and run the `configure' script. `configure' automatically checks for the source code in the directory that `configure' is in and in `..'. This is known as a "VPATH" build. With a non-GNU `make', it is safer to compile the package for one architecture at a time in the source code directory. After you have installed the package for one architecture, use `make distclean' before reconfiguring for another architecture. On MacOS X 10.5 and later systems, you can create libraries and executables that work on multiple system types--known as "fat" or "universal" binaries--by specifying multiple `-arch' options to the compiler but only a single `-arch' option to the preprocessor. Like this: ./configure CC="gcc -arch i386 -arch x86_64 -arch ppc -arch ppc64" \ CXX="g++ -arch i386 -arch x86_64 -arch ppc -arch ppc64" \ CPP="gcc -E" CXXCPP="g++ -E" This is not guaranteed to produce working output in all cases, you may have to build one architecture at a time and combine the results using the `lipo' tool if you have problems. 
Installation Names ================== By default, `make install' installs the package's commands under `/usr/local/bin', include files under `/usr/local/include', etc. You can specify an installation prefix other than `/usr/local' by giving `configure' the option `--prefix=PREFIX', where PREFIX must be an absolute file name. You can specify separate installation prefixes for architecture-specific files and architecture-independent files. If you pass the option `--exec-prefix=PREFIX' to `configure', the package uses PREFIX as the prefix for installing programs and libraries. Documentation and other data files still use the regular prefix. In addition, if you use an unusual directory layout you can give options like `--bindir=DIR' to specify different values for particular kinds of files. Run `configure --help' for a list of the directories you can set and what kinds of files go in them. In general, the default for these options is expressed in terms of `${prefix}', so that specifying just `--prefix' will affect all of the other directory specifications that were not explicitly provided. The most portable way to affect installation locations is to pass the correct locations to `configure'; however, many packages provide one or both of the following shortcuts of passing variable assignments to the `make install' command line to change installation locations without having to reconfigure or recompile. The first method involves providing an override variable for each affected directory. For example, `make install prefix=/alternate/directory' will choose an alternate location for all directory configuration variables that were expressed in terms of `${prefix}'. Any directories that were specified during `configure', but not in terms of `${prefix}', must each be overridden at install time for the entire installation to be relocated. 
The approach of makefile variable overrides for each directory variable is required by the GNU Coding Standards, and ideally causes no recompilation. However, some platforms have known limitations with the semantics of shared libraries that end up requiring recompilation when using this method, particularly noticeable in packages that use GNU Libtool. The second method involves providing the `DESTDIR' variable. For example, `make install DESTDIR=/alternate/directory' will prepend `/alternate/directory' before all installation names. The approach of `DESTDIR' overrides is not required by the GNU Coding Standards, and does not work on platforms that have drive letters. On the other hand, it does better at avoiding recompilation issues, and works well even when some directory options were not specified in terms of `${prefix}' at `configure' time. Optional Features ================= If the package supports it, you can cause programs to be installed with an extra prefix or suffix on their names by giving `configure' the option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'. Some packages pay attention to `--enable-FEATURE' options to `configure', where FEATURE indicates an optional part of the package. They may also pay attention to `--with-PACKAGE' options, where PACKAGE is something like `gnu-as' or `x' (for the X Window System). The `README' should mention any `--enable-' and `--with-' options that the package recognizes. For packages that use the X Window System, `configure' can usually find the X include and library files automatically, but if it doesn't, you can use the `configure' options `--x-includes=DIR' and `--x-libraries=DIR' to specify their locations. Some packages offer the ability to configure how verbose the execution of `make' will be. 
For these packages, running `./configure --enable-silent-rules' sets the default to minimal output, which can be overridden with `make V=1'; while running `./configure --disable-silent-rules' sets the default to verbose, which can be overridden with `make V=0'. Particular systems ================== On HP-UX, the default C compiler is not ANSI C compatible. If GNU CC is not installed, it is recommended to use the following options in order to use an ANSI C compiler: ./configure CC="cc -Ae -D_XOPEN_SOURCE=500" and if that doesn't work, install pre-built binaries of GCC for HP-UX. On OSF/1 a.k.a. Tru64, some versions of the default C compiler cannot parse its `<wchar.h>' header file. The option `-nodtk' can be used as a workaround. If GNU CC is not installed, it is therefore recommended to try ./configure CC="cc" and if that doesn't work, try ./configure CC="cc -nodtk" On Solaris, don't put `/usr/ucb' early in your `PATH'. This directory contains several dysfunctional programs; working variants of these programs are available in `/usr/bin'. So, if you need `/usr/ucb' in your `PATH', put it _after_ `/usr/bin'. On Haiku, software installed for all users goes in `/boot/common', not `/usr/local'. It is recommended to use the following options: ./configure --prefix=/boot/common Specifying the System Type ========================== There may be some features `configure' cannot figure out automatically, but needs to determine by the type of machine the package will run on. Usually, assuming the package is built to be run on the _same_ architectures, `configure' can figure that out, but if it prints a message saying it cannot guess the machine type, give it the `--build=TYPE' option. TYPE can either be a short name for the system type, such as `sun4', or a canonical name which has the form: CPU-COMPANY-SYSTEM where SYSTEM can have one of these forms: OS KERNEL-OS See the file `config.sub' for the possible values of each field. 
If `config.sub' isn't included in this package, then this package doesn't need to know the machine type. If you are _building_ compiler tools for cross-compiling, you should use the option `--target=TYPE' to select the type of system they will produce code for. If you want to _use_ a cross compiler, that generates code for a platform different from the build platform, you should specify the "host" platform (i.e., that on which the generated programs will eventually be run) with `--host=TYPE'. Sharing Defaults ================ If you want to set default values for `configure' scripts to share, you can create a site shell script called `config.site' that gives default values for variables like `CC', `cache_file', and `prefix'. `configure' looks for `PREFIX/share/config.site' if it exists, then `PREFIX/etc/config.site' if it exists. Or, you can set the `CONFIG_SITE' environment variable to the location of the site script. A warning: not all `configure' scripts look for a site script. Defining Variables ================== Variables not defined in a site shell script can be set in the environment passed to `configure'. However, some packages may run configure again during the build, and the customized values of these variables may be lost. In order to avoid this problem, you should set them in the `configure' command line, using `VAR=value'. For example: ./configure CC=/usr/local2/bin/gcc causes the specified `gcc' to be used as the C compiler (unless it is overridden in the site shell script). Unfortunately, this technique does not work for `CONFIG_SHELL' due to an Autoconf bug. Until the bug is fixed you can use this workaround: CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash `configure' Invocation ====================== `configure' recognizes the following options to control how it operates. `--help' `-h' Print a summary of all of the options to `configure', and exit. 
`--help=short' `--help=recursive' Print a summary of the options unique to this package's `configure', and exit. The `short' variant lists options used only in the top level, while the `recursive' variant lists options also present in any nested packages. `--version' `-V' Print the version of Autoconf used to generate the `configure' script, and exit. `--cache-file=FILE' Enable the cache: use and save the results of the tests in FILE, traditionally `config.cache'. FILE defaults to `/dev/null' to disable caching. `--config-cache' `-C' Alias for `--cache-file=config.cache'. `--quiet' `--silent' `-q' Do not print messages saying which checks are being made. To suppress all normal output, redirect it to `/dev/null' (any error messages will still be shown). `--srcdir=DIR' Look for the package's source code in directory DIR. Usually `configure' can determine that directory automatically. `--prefix=DIR' Use DIR as the installation prefix. *note Installation Names:: for more details, including other options available for fine-tuning the installation locations. `--no-create' `-n' Run the configure checks, but stop before creating any output files. `configure' also accepts some other, not widely useful, options. Run `configure --help' for more details. 
sbd-1.5.1/Makefile.am000066400000000000000000000101531414464774600143430ustar00rootroot00000000000000SUBDIRS = src agent man tests # .gz because github doesn't support .xz yet :-( # this is modified # TAG ?= $(shell git log --pretty="format:%H" -n 1 || sed -n -e "s/%global longcommit\s*//p" sbd.spec)$(shell test -n "$$(git status -s)" && echo -n "-mod") SPEC_VERSION ?= $(shell sed -n -e "s/Version:\s*//p" sbd.spec) distdir = $(PACKAGE)-$(TAG) TARFILE = $(distdir).tar.gz DIST_ARCHIVES = $(TARFILE) KEEP_EXISTING_TAR = no INJECT_GIT_COMMIT = yes CLEANFILES = *.rpm *.tar.* sbd-* DISTCLEANFILES = sbd-* sbd-*/ RPM_ROOT = $(shell pwd) RPM_OPTS = --define "_sourcedir $(RPM_ROOT)" \ --define "_specdir $(RPM_ROOT)" \ --define "_srcrpmdir $(RPM_ROOT)" \ --define "_builddir $(RPM_ROOT)" \ --define "_rpmdir $(RPM_ROOT)" MOCK_TARGET ?= rhel-7.1-candidate-x86_64 MOCK_OPTIONS ?= --resultdir=$(RPM_ROOT)/mock --no-cleanup-after BUILD_COUNTER ?= build.counter COUNT = $(shell expr 1 + 0$(shell cat $(BUILD_COUNTER))) COMMIT_COUNTER ?= $(shell git describe --tags --long --always --match v$(SPEC_VERSION) | cut -f 2 -d- -s) TESTS = tests/regressions.sh export SBD_BINARY := src/sbd export SBD_PRELOAD := tests/.libs/libsbdtestbed.so export SBD_USE_DM := no EXTRA_DIST = sbd.spec tests/regressions.sh man/sbd.8.pod.in pkgconfigdir = $(datadir)/pkgconfig pkgconfig_DATA = sbd.pc export: rm -f $(PACKAGE)-HEAD.tar.* if test "$(KEEP_EXISTING_TAR)" != "yes"; then \ rm -f $(TARFILE); \ fi; ! (git status -s | grep "??" 
&& echo "untracked files present in git-repo" ) if [ -f $(TARFILE) ]; then \ echo `date`: Using existing tarball: $(TARFILE); \ else \ rm -f $(PACKAGE).tar.*; \ (git archive --prefix=$(distdir)/ $(shell echo $(TAG)|cut -f1 -d-) || tar -c --transform="s,^,$(distdir)/," --exclude="*.tar.*" --exclude="$(distdir)" --exclude="*.o" --exclude="*.8" --exclude="config.*" --exclude="libtool" --exclude="ltmain.sh*" --exclude="Makefile" --exclude="Makefile.in" --exclude="stamp-*" --exclude="*.service" --exclude="sbd" --exclude="*.m4" --exclude="*.cache" --exclude="configure" --exclude="*.list" --exclude="depcomp" --exclude="install-sh" --exclude="missing" --exclude="compile" --exclude="sbd.sh" --exclude="sbd.sysconfig" --exclude="~" --exclude="*.swp" --exclude="*.patch" --exclude="*.diff" --exclude="*.orig" --exclude="*.rej" --exclude="*.rpm" --exclude="*.pod" --exclude=".deps" --exclude="test-driver" --exclude="sbd.pc" --exclude="build.counter" *) | gzip > $(TARFILE); \ if test -n "$$(git status -s)" || test "$(INJECT_GIT_COMMIT)" = "yes"; then \ if test -n "$$(git status -s)"; then git diff HEAD --name-only|grep -v "^\."|xargs -n1 git diff HEAD > uncommitted.diff; fi; \ rm -rf $(distdir); tar -xzf $(TARFILE); rm $(TARFILE); \ cd $(distdir); \ if test -n "$$(git status -s)"; then patch -p1 -i ../uncommitted.diff; fi; \ cd ..; \ sed -i 's/global\ commit.*/global\ commit\ $(TAG)/' $(distdir)/$(PACKAGE).spec; \ tar -czf $(TARFILE) $(distdir); rm -rf $(distdir); \ rm -f uncommitted.diff; \ fi; \ echo `date`: Rebuilt $(TARFILE); \ fi #replace commit id in sbd.spec spec: rm -f *.src.rpm rm -rf $(distdir) mkdir $(distdir) cp $(PACKAGE).spec $(distdir) sed -i 's/global\ longcommit.*/global\ longcommit\ $(TAG)/' $(distdir)/$(PACKAGE).spec if [ -e $(BUILD_COUNTER) ]; then \ sed -i 's/global\ build_counter.*/global\ build_counter\ $(COUNT)/' $(distdir)/$(PACKAGE).spec; \ echo $(COUNT) > $(BUILD_COUNTER); \ fi if [ -n "$(COMMIT_COUNTER)" ]; then \ sed -i 's/global\ 
commit_counter.*/global\ commit_counter\ $(COMMIT_COUNTER)/' $(distdir)/$(PACKAGE).spec; \ fi srpm: export spec rpmbuild $(RPM_OPTS) -bs $(distdir)/$(PACKAGE).spec rpm: export spec rpmbuild $(RPM_OPTS) -ba $(distdir)/$(PACKAGE).spec mock: srpm -rm -rf $(RPM_ROOT)/mock @echo "mock --root=$* --rebuild $(MOCK_OPTIONS) $(RPM_ROOT)/*.src.rpm" mock --root=$(MOCK_TARGET) --rebuild $(MOCK_OPTIONS) $(RPM_ROOT)/*.src.rpm beekhof: mock cluster-helper -- 'rm -f sbd-*.x86_64.rpm' cluster-helper --copy $(RPM_ROOT)/mock/sbd-*.x86_64.rpm {}: cluster-helper -- yum install -y sbd-*.x86_64.rpm sbd-1.5.1/NEWS000066400000000000000000000007501414464774600130100ustar00rootroot000000000000002013-07-04 - v1.2.0 This is, finally, a public release after several years of development and living mostly in SUSE Linux Enterprise High Availability 11 (and previously, cluster-glue). "sbd" ("shared-storage based death", named originally in deference to Novell Cluster Suite's "split brain detector") is a highly reliable fencing mechanism that utilizes shared storage for arbitration and exchange of fencing commands. New in this release: - Initial public release - Move to github sbd-1.5.1/README000066400000000000000000000000001414464774600131550ustar00rootroot00000000000000sbd-1.5.1/README.md000066400000000000000000000007451414464774600135740ustar00rootroot00000000000000# Shared-storage based death # A highly reliable fencing or Shoot-the-other-node-in-the-head (STONITH) mechanism that works by utilizing shared storage. The component works with Pacemaker clusters, and is currently known to compile and function on Pacemaker 1.1.7+ or 2.0.0+ and corosync 1.4.x, 2.3.x or 3.0.x. Please see https://github.com/clusterlabs/sbd/blob/master/man/sbd.8.pod.in & https://github.com/clusterlabs/sbd/blob/master/src/sbd.sysconfig for the full documentation. 
sbd-1.5.1/agent/000077500000000000000000000000001414464774600134055ustar00rootroot00000000000000sbd-1.5.1/agent/Makefile.am000066400000000000000000000001031414464774600154330ustar00rootroot00000000000000agentdir = $(libdir)/stonith/plugins/external agent_SCRIPTS = sbd sbd-1.5.1/agent/sbd.in000066400000000000000000000113521414464774600145070ustar00rootroot00000000000000#!/bin/bash # # This STONITH script drives the shared-storage stonith plugin. # # Copyright (C) 2013 Lars Marowsky-Bree # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public # License as published by the Free Software Foundation; either # version 2 of the License, or (at your option) any later version. # # This software is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # # Main code if [ -z "$sbd_device" ]; then if [ -f @CONFIGDIR@/sbd ]; then source @CONFIGDIR@/sbd sbd_device=$SBD_DEVICE fi fi SBD_DEVS=${sbd_device%;} sbd_device=${SBD_DEVS//;/ -d } sbd_check_device() { if [ -z "$sbd_device" ]; then ha_log.sh err "No sbd device(s) found in the configuration." 
exit 1 fi } sbd_validate_timeout() { case "$timeout_bypass" in yes|true|1|YES|TRUE|ja|on|ON) return ;; esac crm_timeout=$[$(crm_attribute -t crm_config -G -n stonith-timeout -d 20s -q | sed -e 's/\(.*\)s/\1/' -e 's/\(.*\)m/\1*60/')] sbd_timeout=$(sbd -d $sbd_device dump | perl -ne 'if (/msgwait.*: (\d+)/) { print "$1\n"; }' | head -n 1) if [ -z "$sbd_timeout" -o "$sbd_timeout" = "0" ]; then return fi sbd_timeout_min=$[$sbd_timeout*12/10] if [ "$sbd_timeout_min" -lt 20 ]; then sbd_timeout_min=20 fi sbd_timeout_suggested=$[$sbd_timeout_min*12/10] if [ "$crm_timeout" -lt "$sbd_timeout_min" ]; then ha_log.sh err "The CIB property stonith-timeout is set too low for sbd to ever succeed" ha_log.sh err "Recommended value is $sbd_timeout_suggested, updating configuration." crm_attribute -t crm_config -n stonith-timeout -v $sbd_timeout_suggested exit 1 fi } case $1 in gethosts) sbd_check_device echo `sbd -d $sbd_device list | cut -f2 | sort | uniq` exit 0 ;; off|reset) sbd_check_device sbd_validate_timeout message=$1 case "$crashdump" in yes|true|1|YES|TRUE|ja|on|ON) message="crashdump" ;; esac sbd -d $sbd_device message $2 $message exit $? ;; status) sbd_check_device sbd_validate_timeout error_output=$(sbd -d $sbd_device list 2>&1 >/dev/null) if [ $? -ne 0 ]; then error_message=$(echo "$error_output" | grep -v "please check the logs") ha_log.sh err "sbd list failed: $error_message" exit 1 fi exit 0 ;; on) exit 1 ;; getconfignames) echo "sbd_device crashdump timeout_bypass" exit 0 ;; getinfo-devid) echo "Shared storage STONITH device" exit 0 ;; getinfo-devname) echo "Shared storage STONITH device" exit 0 ;; getinfo-devdescr) cat << DESC sbd uses a shared storage device as a medium to communicate fencing requests. This allows clusters without network power switches; the downside is that access to the shared storage device becomes a Single Point of Failure. It requires sbd to be configured on all nodes. Please read http://linux-ha.org/wiki/SBD_Fencing! 
DESC exit 0 ;; getinfo-devurl) echo "http://linux-ha.org/wiki/SBD_Fencing" exit 0 ;; getinfo-xml) cat << SSHXML Crashdump instead of regular fence If SBD is given a fence command, this option will instead perform a kernel crash of a reboot or power-off, which on a properly configured system can lead to a crashdump for analysis. This is less safe for production environments. Please use with caution and for debugging purposes only. SBD device(s) The block device used for the SBD partition. Up to three can be specified if separated by a semicolon. (Please check the documentation if specifying two.) If not specified, will default to the value from @CONFIGDIR@/sbd. Permit a seemingly too short stonith-timeout The sbd agent will try to detect a too short stonith-timeout (relative to msgwait) in the Pacemaker configuration and automatically correct it. Should that logic fail in your environment or you have legitimate need to use a shorter timeout, you can disable it via this parameter. SSHXML exit 0 ;; *) exit 1 ;; esac sbd-1.5.1/autogen.sh000077500000000000000000000005151414464774600143110ustar00rootroot00000000000000#!/bin/sh am_ver=`automake --version | sed -n 1p` case $am_ver in *\ 1.11*|*\ 1.12*) echo 'm4_define([TESTS_OPTION], [])';; *) echo 'm4_define([TESTS_OPTION], [serial-tests])';; esac > tests-opt.m4 cat tests-opt.m4 # Run this to generate all the initial makefiles, etc. 
autoreconf -i -v && echo Now run ./configure and make sbd-1.5.1/configure.ac000066400000000000000000000315211414464774600145770ustar00rootroot00000000000000dnl dnl autoconf for Agents dnl dnl License: GNU General Public License (GPL) dnl =============================================== dnl Bootstrap dnl =============================================== AC_PREREQ(2.63) dnl Suggested structure: dnl information on the package dnl checks for programs dnl checks for libraries dnl checks for header files dnl checks for types dnl checks for structures dnl checks for compiler characteristics dnl checks for library functions dnl checks for system services AC_INIT([sbd], [1.5.1], [lmb@suse.com]) m4_include([tests-opt.m4]) AC_CANONICAL_HOST AC_CONFIG_AUX_DIR(.) AC_CONFIG_HEADERS(config.h) m4_ifdef([AM_SILENT_RULES],[AM_SILENT_RULES([no])]) AM_INIT_AUTOMAKE(1.11.1 foreign TESTS_OPTION) LT_INIT([dlopen],[disable-static]) AM_PROG_CC_C_O # expand_path_option $path_variable_name $default expand_path_option() { # The first argument is the variable *name* (not value) ac_path_varname="$1" # Get the original value of the variable ac_path_value=$(eval echo "\${${ac_path_varname}}") # Expand any literal variable expressions in the value so that we don't # end up with something like '${prefix}' in #defines etc. # # Autoconf deliberately leaves values unexpanded to allow overriding # the configure script choices in make commands (for example, # "make exec_prefix=/foo install"). No longer being able to do this seems # like no great loss. 
eval ac_path_value=$(eval echo "${ac_path_value}") # Use (expanded) default if necessary AS_IF([test x"${ac_path_value}" = x""], [eval ac_path_value=$(eval echo "$2")]) # Require a full path AS_CASE(["$ac_path_value"], [/*], [eval ${ac_path_varname}="$ac_path_value"], [*], [AC_MSG_ERROR([$ac_path_varname value "$ac_path_value" is not a full path])] ) } PKG_CHECK_MODULES(glib, [glib-2.0]) PKG_CHECK_MODULES(libxml, [libxml-2.0]) PKG_CHECK_MODULES(cmap, [libcmap], HAVE_cmap=1, HAVE_cmap=0) PKG_CHECK_MODULES(votequorum, [libvotequorum], HAVE_votequorum=1, HAVE_votequorum=0) dnl pacemaker > 1.1.8 PKG_CHECK_MODULES(pacemaker, [pacemaker, pacemaker-cib], HAVE_pacemaker=1, HAVE_pacemaker=0) dnl pacemaker <= 1.1.8 PKG_CHECK_MODULES(pcmk, [pcmk, pcmk-cib], HAVE_pcmk=1, HAVE_pcmk=0) PKG_CHECK_MODULES(libqb, [libqb]) CPPFLAGS="$CPPFLAGS -Werror $glib_CFLAGS $libxml_CFLAGS" LIBS="$LIBS $glib_LIBS $libxml_LIBS" if test $HAVE_pacemaker = 0 -a $HAVE_pcmk = 0; then AC_MSG_ERROR(No package 'pacemaker' found) elif test $HAVE_pacemaker = 1; then CPPFLAGS="$CPPFLAGS $glib_CFLAGS $pacemaker_CFLAGS" if test $HAVE_cmap = 0; then AC_MSG_NOTICE(No library 'cmap' found) else CPPFLAGS="$CPPFLAGS $cmap_CFLAGS" LIBS="$LIBS $cmap_LIBS" fi if test $HAVE_votequorum = 0; then AC_MSG_NOTICE(No library 'votequorum' found) else CPPFLAGS="$CPPFLAGS $votequorum_CFLAGS" LIBS="$LIBS $votequorum_LIBS" fi fi CPPFLAGS="$CPPFLAGS $libqb_CFLAGS $pacemaker_CFLAGS $pcmk_CFLAGS" LIBS="$LIBS $libqb_LIBS $pacemaker_LIBS $pcmk_LIBS" dnl checks for libraries AC_CHECK_LIB(c, dlopen) dnl if dlopen is in libc... 
AC_CHECK_LIB(dl, dlopen) dnl -ldl (for Linux) AC_CHECK_LIB(aio, io_setup, , missing="yes") AC_CHECK_LIB(qb, qb_ipcs_connection_auth_set, , missing="yes") AC_CHECK_LIB(cib, cib_new, , missing="yes") AC_CHECK_LIB(crmcommon, set_crm_log_level, , missing="yes") AC_CHECK_LIB(pe_status, pe_find_node, , missing="yes") AC_CHECK_LIB(pe_rules, test_rule, , missing="yes") AC_CHECK_LIB(crmcluster, crm_peer_init, , missing="yes") AC_CHECK_LIB(uuid, uuid_unparse, , missing="yes") AC_CHECK_LIB(cmap, cmap_initialize, , HAVE_cmap=0) AC_CHECK_LIB(votequorum, votequorum_getinfo, , HAVE_votequorum=0) AC_CHECK_LIB(crmcommon, pcmk_pacemakerd_api_ping, HAVE_pacemakerd_api=1, HAVE_pacemakerd_api=0) dnl pacemaker >= 1.1.8 AC_CHECK_HEADERS(crm/cluster.h) AC_CHECK_LIB(crmcommon, pcmk_strerror, , missing="yes") AC_CHECK_LIB(cib, cib_apply_patch_event, , missing="yes") dnl pacemaker-2.0 removed support for corosync 1 cluster layer AC_CHECK_DECLS([pcmk_cluster_classic_ais, pcmk_cluster_cman],,, [#include ]) dnl check for additional no-quorum-policies dnl AC_TEST_NO_QUORUM_POLICY(POLICY) AC_DEFUN([AC_TEST_NO_QUORUM_POLICY],[ AC_MSG_CHECKING([whether enum pe_quorum_policy defines value $1]) AC_LANG_PUSH([C]) AC_COMPILE_IFELSE([AC_LANG_PROGRAM( [#include ], [enum pe_quorum_policy policy = $1; return policy;])], AC_DEFINE_UNQUOTED(m4_toupper(HAVE_ENUM_$1), 1, [Does pe_types.h have $1 value in enum pe_quorum_policy?]) AC_MSG_RESULT([yes]), AC_MSG_RESULT([no])) AC_LANG_POP([C]) ]) AC_TEST_NO_QUORUM_POLICY(no_quorum_demote) dnl check for new pe-API AC_CHECK_FUNCS(pe_new_working_set) dnl check if votequorum comes with default for qdevice-sync_timeout AC_CHECK_DECLS([VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT], HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT=1, HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT=0, [#include ]) if test "$missing" = "yes"; then AC_MSG_ERROR([Missing required libraries or functions.]) fi AC_PATH_PROGS(POD2MAN, pod2man, pod2man) AC_ARG_ENABLE([shared-disk], [ 
--enable-shared-disk Turn on functionality that requires shared disk [default=yes]]) DISK=0 if test "x${enable_shared_disk}" != xno ; then DISK=1 fi AC_DEFINE_UNQUOTED(SUPPORT_SHARED_DISK, $DISK, Turn on functionality that requires shared disk) AM_CONDITIONAL(SUPPORT_SHARED_DISK, test "$DISK" = "1") if test -e /proc/$$ then echo "/proc/{pid} is supported" AC_DEFINE_UNQUOTED(HAVE_PROC_PID, 1, Define to 1 if /proc/{pid} is supported.) fi AC_DEFINE_UNQUOTED(CHECK_TWO_NODE, $HAVE_cmap, Turn on checking for 2-node cluster) AM_CONDITIONAL(CHECK_TWO_NODE, test "$HAVE_cmap" = "1") AC_DEFINE_UNQUOTED(CHECK_VOTEQUORUM_HANDLE, $HAVE_votequorum, Turn on periodic checking of votequorum-handle) AM_CONDITIONAL(CHECK_VOTEQUORUM_HANDLE, test "$HAVE_votequorum" = "1") AC_DEFINE_UNQUOTED(CHECK_QDEVICE_SYNC_TIMEOUT, ($HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT && $HAVE_cmap), Turn on checking if watchdog-timeout and qdevice-sync_timeout are matching) AM_CONDITIONAL(CHECK_QDEVICE_SYNC_TIMEOUT, test "$HAVE_DECL_VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT" = "1" && test "$HAVE_cmap" = "1") AC_DEFINE_UNQUOTED(USE_PACEMAKERD_API, $HAVE_pacemakerd_api, Turn on synchronization between sbd & pacemakerd) AM_CONDITIONAL(USE_PACEMAKERD_API, test "$HAVE_pacemakerd_api" = "1") CONFIGDIR="" AC_ARG_WITH(configdir, [ --with-configdir=DIR Directory for SBD configuration file [${CONFIGDIR}]], [ CONFIGDIR="$withval" ] ) dnl --runstatedir is available as of autoconf 2.70 (2020-12-08). When users dnl have an older version, they can use our --with-runstatedir. 
sbd_runstatedir="" AC_ARG_WITH([runstatedir], [AS_HELP_STRING([--with-runstatedir=DIR], [modifiable per-process data @<:@LOCALSTATEDIR/run@:>@ (ignored if --runstatedir is available)])], [ sbd_runstatedir="$withval" ] ) SBD_WATCHDOG_TIMEOUT_DEFAULT="" AC_ARG_WITH(watchdog-timeout-default, [ --with-watchdog-timeout-default=SECONDS Timeout in seconds SBD will configure the watchdog per default], [ SBD_WATCHDOG_TIMEOUT_DEFAULT="$withval" ] ) SBD_SYNC_RESOURCE_STARTUP_DEFAULT="" AC_ARG_WITH(sync-resource-startup-default, [ --with-sync-resource-startup-default=yes|no Default for SBD_SYNC_RESOURCE_STARTUP if not given in SBD configuration file explicitly], [ SBD_SYNC_RESOURCE_STARTUP_DEFAULT="$withval" ] ) SBD_SYNC_RESOURCE_STARTUP_SYSCONFIG="" AC_ARG_WITH(sync-resource-startup-sysconfig, [ --with-sync-resource-startup-sysconfig=yes|no Value for SBD_SYNC_RESOURCE_STARTUP going into template SBD configuration file], [ SBD_SYNC_RESOURCE_STARTUP_SYSCONFIG="$withval" ] ) # # Where is dlopen? # if test "$ac_cv_lib_c_dlopen" = yes; then LIBADD_DL="" elif test "$ac_cv_lib_dl_dlopen" = yes; then LIBADD_DL=-ldl else LIBADD_DL=${lt_cv_dlopen_libs} fi dnl ********************************************************************** dnl Check for various argv[] replacing functions on various OSs dnl dnl Borrowed from Proftpd dnl Proftpd is Licenced under the terms of the GNU General Public Licence dnl and is available from http://www.proftpd.org/ dnl AC_CHECK_FUNCS(setproctitle) AC_CHECK_HEADERS(libutil.h) AC_CHECK_LIB(util, setproctitle, [AC_DEFINE(HAVE_SETPROCTITLE,1,[ ]) ac_cv_func_setproctitle="yes" ; LIBS="$LIBS -lutil"]) if test "$ac_cv_func_setproctitle" = "yes"; then pf_argv_set="PF_ARGV_NONE" fi if test "$pf_argv_set" = ""; then AC_CHECK_HEADERS(sys/pstat.h) if test "$ac_cv_header_pstat_h" = "yes"; then AC_CHECK_FUNCS(pstat) if test "$ac_cv_func_pstat" = "yes"; then pf_argv_set="PF_ARGV_PSTAT" else pf_argv_set="PF_ARGV_WRITEABLE" fi fi if test "$pf_argv_set" = ""; then 
AC_EGREP_HEADER([#define.*PS_STRINGS.*],sys/exec.h, have_psstrings="yes",have_psstrings="no") if test "$have_psstrings" = "yes"; then pf_argv_set="PF_ARGV_PSSTRINGS" fi fi if test "$pf_argv_set" = ""; then AC_CACHE_CHECK(whether __progname and __progname_full are available, pf_cv_var_progname, AC_TRY_LINK([extern char *__progname, *__progname_full;], [__progname = "foo"; __progname_full = "foo bar";], pf_cv_var_progname="yes", pf_cv_var_progname="no")) if test "$pf_cv_var_progname" = "yes"; then AC_DEFINE(HAVE___PROGNAME,1,[ ]) fi AC_CACHE_CHECK(which argv replacement method to use, pf_cv_argv_type, AC_EGREP_CPP(yes,[ #if defined(__GNU_HURD__) yes #endif ],pf_cv_argv_type="new", pf_cv_argv_type="writeable")) if test "$pf_cv_argv_type" = "new"; then pf_argv_set="PF_ARGV_NEW" fi if test "$pf_argv_set" = ""; then pf_argv_set="PF_ARGV_WRITEABLE" fi fi fi AC_DEFINE_UNQUOTED(PF_ARGV_TYPE, $pf_argv_set, mechanism to pretty-print ps output: setproctitle-equivalent) dnl End of tests borrowed from Proftpd AC_MSG_NOTICE(Sanitizing prefix: ${prefix}) case $prefix in NONE) prefix=/usr dnl Fix default variables - "prefix" variable if not specified if test "$localstatedir" = "\${prefix}/var"; then localstatedir="/var" fi if test "$sysconfdir" = "\${prefix}/etc"; then sysconfdir="/etc" fi ;; esac AC_MSG_NOTICE(Sanitizing exec_prefix: ${exec_prefix}) case $exec_prefix in dnl For consistency with Heartbeat, map NONE->$prefix NONE) exec_prefix=$prefix;; prefix) exec_prefix=$prefix;; esac dnl Expand values of autoconf-provided directory options expand_path_option prefix expand_path_option exec_prefix expand_path_option bindir expand_path_option sbindir expand_path_option libexecdir expand_path_option datadir expand_path_option sysconfdir expand_path_option sharedstatedir expand_path_option localstatedir expand_path_option libdir expand_path_option includedir expand_path_option oldincludedir expand_path_option infodir expand_path_option mandir AS_IF([test x"${runstatedir}" = x""], 
[runstatedir="${sbd_runstatedir}"]) expand_path_option runstatedir "${localstatedir}/run" AC_SUBST(runstatedir) AC_SUBST(LIBADD_DL) dnl extra flags for dynamic linking libraries expand_path_option CONFIGDIR "${sysconfdir}/sysconfig" AC_SUBST(CONFIGDIR) if test x"${SBD_WATCHDOG_TIMEOUT_DEFAULT}" = x""; then case "$host_cpu" in s390|s390x) SBD_WATCHDOG_TIMEOUT_DEFAULT=15 ;; *) SBD_WATCHDOG_TIMEOUT_DEFAULT=5 ;; esac fi AC_SUBST(SBD_WATCHDOG_TIMEOUT_DEFAULT) AC_DEFINE_UNQUOTED(SBD_WATCHDOG_TIMEOUT_DEFAULT, $SBD_WATCHDOG_TIMEOUT_DEFAULT, Timeout in seconds SBD will configure the watchdog per default) if test x"${SBD_SYNC_RESOURCE_STARTUP_DEFAULT}" = x""; then SBD_SYNC_RESOURCE_STARTUP_DEFAULT=yes fi AC_SUBST(SBD_SYNC_RESOURCE_STARTUP_DEFAULT) dnl rather pass to C as a string and interpret there for consistent interpretation AC_DEFINE_UNQUOTED(SBD_SYNC_RESOURCE_STARTUP_DEFAULT, "${SBD_SYNC_RESOURCE_STARTUP_DEFAULT}", Default for SBD_SYNC_RESOURCE_STARTUP if not given in SBD configuration file explicitly) if test x"${SBD_SYNC_RESOURCE_STARTUP_SYSCONFIG}" = x""; then SBD_SYNC_RESOURCE_STARTUP_SYSCONFIG=${SBD_SYNC_RESOURCE_STARTUP_DEFAULT} fi AC_SUBST(SBD_SYNC_RESOURCE_STARTUP_SYSCONFIG) dnl The Makefiles and shell scripts we output AC_CONFIG_FILES([Makefile src/Makefile agent/Makefile man/Makefile tests/Makefile agent/sbd src/sbd.service src/sbd_remote.service src/sbd.sh src/sbd.sysconfig sbd.pc]) dnl Now process the entire list of files added by previous dnl calls to AC_CONFIG_FILES() AC_OUTPUT() sbd-1.5.1/man/000077500000000000000000000000001414464774600130625ustar00rootroot00000000000000sbd-1.5.1/man/Makefile.am000066400000000000000000000011201414464774600151100ustar00rootroot00000000000000dist_man_MANS = sbd.8 DISTCLEANFILES = sbd.8.pod sbd.8 sbd.sysconfig.pod sbd.sysconfig.pod: ../src/sbd.sysconfig sed -r -n -e "s/^## Type: (.*)/Allows C<\1>/;t type;s/^## Default: (.*)/ defaulting to C<\1>/;t default;s/^#*(.*)=.*/=item B<\1>\n/;t variable;s/^#*//;s/^ 
*//;H;d;:type;h;d;:default;H;x;s/\n//;x;d;:variable;G;p" $< > $@ sbd.8.pod: sbd.8.pod.in sbd.sysconfig.pod sed -e "s,\@runstatedir\@,$(runstatedir)," $< |sed -e "s/@environment_section@//;t insert;p;d;:insert;rsbd.sysconfig.pod" > $@ sbd.8: sbd.8.pod @POD2MAN@ -s 8 -c "STONITH Block Device" -r "SBD" -n "SBD" $< $@ sbd-1.5.1/man/sbd.8.pod.in000066400000000000000000000551641414464774600151240ustar00rootroot00000000000000=head1 NAME sbd - STONITH Block Device daemon =head1 SYNOPSIS sbd <-d F> [options] C =head1 SUMMARY SBD provides a node fencing mechanism (Shoot the other node in the head, STONITH) for Pacemaker-based clusters through the exchange of messages via shared block storage such as for example a SAN, iSCSI, FCoE. This isolates the fencing mechanism from changes in firmware version or dependencies on specific firmware controllers, and it can be used as a STONITH mechanism in all configurations that have reliable shared storage. SBD can also be used without any shared storage. In this mode, the watchdog device will be used to reset the node if it loses quorum, if any monitored daemon is lost and not recovered or if Pacemaker decides that the node requires fencing. The F binary implements both the daemon that watches the message slots as well as the management tool for interacting with the block storage device(s). This mode of operation is specified via the C parameter; some of these modes take additional parameters. To use SBD with shared storage, you must first C the messaging layout on one to three block devices. Second, configure F to list those devices (and possibly adjust other options), and restart the cluster stack on each node to ensure that C is started. Third, configure the C fencing resource in the Pacemaker CIB. Each of these steps is documented in more detail below the description of the command options. C can only be used as root. =head2 GENERAL OPTIONS =over =item B<-d> F Specify the block device(s) to be used. 
If you have more than one, specify this option up to three times. This parameter is mandatory for all modes, since SBD always needs a block device to interact with. This man page uses F</dev/sda1>, F</dev/sdb1>, and F</dev/sdc1> as example device names for brevity. However, in your production environment, you should instead always refer to them by using the long, stable device name (e.g., F</dev/disk/by-id/dm-uuid-part1-mpath-3600508b400105b5a0001500000250000>). =item B<-v|-vv|-vvv> Enable verbose|debug|debug-library logging (optional) =item B<-h> Display a concise summary of C<sbd> options. =item B<-n> I<node> Set local node name; defaults to C<uname -n>. This should not need to be set. =item B<-R> Do B<not> enable realtime priority. By default, C<sbd> runs at realtime priority, locks itself into memory, and also acquires highest IO priority to protect itself against interference from other processes on the system. This is a debugging-only option. =item B<-I> I<N> Async IO timeout (defaults to 3 seconds, optional). You should not need to adjust this unless your IO setup is really very slow. (In daemon mode, the watchdog is refreshed when the majority of devices could be read within this time.) =back =head2 create Example usage: sbd -d /dev/sdc2 -d /dev/sdd3 create If you specify the I<create> command, sbd will write a metadata header to the device(s) specified and also initialize the messaging slots for up to 255 nodes. B<Warning>: This command will not prompt for confirmation. Roughly the first megabyte of the specified block device(s) will be overwritten immediately and without backup. This command accepts a few options to adjust the default timings that are written to the metadata (to ensure they are identical across all nodes accessing the device). =over =item B<-1> I<N> Set watchdog timeout to N seconds. This depends mostly on your storage latency; the majority of devices must be successfully read within this time, or else the node will self-fence. If your sbd device(s) reside on a multipath setup or iSCSI, this should be the time required to detect a path failure. 
You may be able to reduce this if your device outages are independent, or if you are using the Pacemaker integration. =item B<-2> I<N> Set slot allocation timeout to N seconds. You should not need to tune this. =item B<-3> I<N> Set daemon loop timeout to N seconds. You should not need to tune this. =item B<-4> I<N> Set I<msgwait> timeout to N seconds. This should be twice the I<watchdog> timeout. This is the time after which a message written to a node's slot will be considered delivered. (Or long enough for the node to detect that it needed to self-fence.) This also affects the I<stonith-timeout> in Pacemaker's CIB; see below. =back =head2 list Example usage: # sbd -d /dev/sda1 list 0 hex-0 clear 1 hex-7 clear 2 hex-9 clear List all allocated slots on device, and messages. You should see all cluster nodes that have ever been started against this device. Nodes that are currently running should have a I<clear> state; nodes that have been fenced, but not yet restarted, will show the appropriate fencing message. =head2 dump Example usage: # sbd -d /dev/sda1 dump ==Dumping header on disk /dev/sda1 Header version : 2 Number of slots : 255 Sector size : 512 Timeout (watchdog) : 15 Timeout (allocate) : 2 Timeout (loop) : 1 Timeout (msgwait) : 30 ==Header on disk /dev/sda1 is dumped Dump meta-data header from device. =head2 watch Example usage: sbd -d /dev/sdc2 -d /dev/sdd3 -P watch This command will make C<sbd> start in daemon mode. It will constantly monitor the message slot of the local node for incoming messages, reachability, and optionally take Pacemaker's state into account. C<sbd> B<must> be started on boot before the cluster stack! See below for enabling this according to your boot environment. The options for this mode are rarely specified directly on the commandline, but most frequently set via F</etc/sysconfig/sbd>. It also constantly monitors connectivity to the storage device, and self-fences in case the partition becomes unreachable, guaranteeing that it does not disconnect from fencing messages. 
A node slot is automatically allocated on the device(s) the first time the daemon starts watching the device; hence, manual allocation is not usually required. If a watchdog is used together with C<sbd>, as is strongly recommended, the watchdog is activated at initial start of the sbd daemon. The watchdog is refreshed every time the majority of SBD devices has been successfully read. Using a watchdog provides additional protection against C<sbd> crashing. If the Pacemaker integration is activated, C<sbd> will B<not> self-fence if device majority is lost, if: =over =item 1. The partition the node is in is still quorate according to the CIB; =item 2. it is still quorate according to Corosync's node count; =item 3. the node itself is considered online and healthy by Pacemaker. =back This allows C<sbd> to survive temporary outages of the majority of devices. However, while the cluster is in such a degraded state, it can neither successfully fence nor be shutdown cleanly (as taking the cluster below the quorum threshold will immediately cause all remaining nodes to self-fence). In short, it will not tolerate any further faults. Please repair the system before continuing. There is one C<sbd> process that acts as a master to which all watchers report; one per device to monitor the node's slot; and, optionally, one that handles the Pacemaker integration. =over =item B<-W> Enable or disable use of the system watchdog to protect against the sbd processes failing and the node being left in an undefined state. Specify this once to enable, twice to disable. Defaults to I<enabled>. =item B<-w> F</dev/watchdog> This can be used to override the default watchdog device used and should not usually be necessary. =item B<-p> F<@runstatedir@/sbd.pid> This option can be used to specify a pidfile for the main sbd process. =item B<-F> I<N> Number of failures before a failing servant process will not be restarted immediately until the dampening delay has expired. If set to zero, servants will be restarted immediately and indefinitely. 
If set to one, a failed servant will be restarted once every B<-t> seconds. If set to a different value, the servant will be restarted that many times within the dampening period and then delay. Defaults to I<1>. =item B<-t> I<N> Dampening delay before faulty servants are restarted. Combined with C<-F 1>, the most logical way to tune the restart frequency of servant processes. Default is 5 seconds. If set to zero, processes will be restarted indefinitely and immediately. =item B<-P> Enable Pacemaker integration which checks Pacemaker quorum and node health. Specify this once to enable, twice to disable. Defaults to I<enabled>. =item B<-S> I<N> Set the start mode. (Defaults to I<0>.) If this is set to zero, sbd will always start up unconditionally, regardless of whether the node was previously fenced or not. If set to one, sbd will only start if the node was previously shutdown cleanly (as indicated by an exit request message in the slot), or if the slot is empty. A reset, crashdump, or power-off request in any slot will halt the start up. This is useful to prevent nodes from rejoining if they were faulty. The node must be manually "unfenced" by sending an empty message to it: sbd -d /dev/sda1 message node1 clear =item B<-s> I<N> Set the start-up wait time for devices. (Defaults to I<120>.) Dynamic block devices such as iSCSI might not be fully initialized and present yet. This allows one to set a timeout for waiting for devices to appear on start-up. If set to 0, start-up will be aborted immediately if no devices are available. =item B<-Z> Enable trace mode. B<Warning: this is unsafe for production, use at your own risk!> Specifying this once will turn all reboots or power-offs, be they caused by self-fence decisions or messages, into a crashdump. Specifying this twice will just log them but not continue running. =item B<-T> By default, the daemon will set the watchdog timeout as specified in the device metadata. However, this does not work for every watchdog device. 
In this case, you must manually ensure that the watchdog timeout used by the system correctly matches the SBD settings, and then specify this option to allow C to continue with start-up. =item B<-5> I Warn if the time interval for tickling the watchdog exceeds this many seconds. Since the node is unable to log the watchdog expiry (it reboots immediately without a chance to write its logs to disk), this is very useful for getting an indication that the watchdog timeout is too short for the IO load of the system. Default is about 3/5 of watchdog timeout, set to zero to disable. =item B<-C> I Watchdog timeout to set before crashdumping. If SBD is set to crashdump instead of reboot - either via the trace mode settings or the I fencing agent's parameter -, SBD will adjust the watchdog timeout to this setting before triggering the dump. Otherwise, the watchdog might trigger and prevent a successful crashdump from ever being written. Set to zero (= default) to disable. =item B<-r> I Actions to be executed when the watchers don't timely report to the sbd master process or one of the watchers detects that the master process has died. Set timeout-action to comma-separated combination of noflush|flush plus reboot|crashdump|off. If just one of both is given the other stays at the default. This doesn't affect actions like off, crashdump, reboot explicitly triggered via message slots. And it does as well not configure the action a watchdog would trigger should it run off (there is no generic interface). Defaults to flush,reboot. =back =head2 allocate Example usage: sbd -d /dev/sda1 allocate node1 Explicitly allocates a slot for the specified node name. This should rarely be necessary, as every node will automatically allocate itself a slot the first time it starts up on watch mode. =head2 message Example usage: sbd -d /dev/sda1 message node1 test Writes the specified message to node's slot. 
This is rarely done directly, but rather abstracted via the C<external/sbd> fencing agent configured as a cluster resource. Supported message types are: =over =item test This only generates a log message on the receiving node and can be used to check if SBD is seeing the device. Note that this could overwrite a fencing request sent by the cluster, so it should not be used during production. =item reset Reset the target upon receipt of this message. =item off Power-off the target. =item crashdump Cause the target node to crashdump. =item exit This will make the C<sbd> daemon exit cleanly on the target. You should B<not> send this message manually; this is handled properly during shutdown of the cluster stack. Manually stopping the daemon means the node is unprotected! =item clear This message indicates that no real message has been sent to the node. You should not set this manually; C<sbd> will clear the message slot automatically during start-up, and setting this manually could overwrite a fencing message by the cluster. =back =head2 query-watchdog Example usage: sbd query-watchdog Check for available watchdog devices and print some info. B<Warning>: This command will arm the watchdog during query, and if your watchdog refuses disarming (for example, if its kernel module has the 'nowayout' parameter set) this will reset your system. =head2 test-watchdog Example usage: sbd test-watchdog [-w /dev/watchdog3] Test specified watchdog device (/dev/watchdog by default). B<Warning>: This command will arm the watchdog and have your system reset in case your watchdog is working properly! If issued from an interactive session, it will prompt for confirmation. =head1 Base system configuration =head2 Configure a watchdog It is highly recommended that you configure your Linux system to load a watchdog driver with hardware assistance (as is available on most modern systems), such as I<hpwdt>, I<iTCO_wdt>, or others. As a fall-back, you can use the I<softdog> module.
No other software may access the watchdog timer; it can only be accessed by one process at any given time. Some hardware vendors ship systems management software that uses the watchdog for system resets (e.g. HP ASR daemon). Such software has to be disabled if the watchdog is to be used by SBD. =head2 Choosing and initializing the block device(s) First, you have to decide if you want to use one, two, or three devices. If you are using multiple ones, they should reside on independent storage setups. Putting all three of them on the same logical unit, for example, would not provide any additional redundancy. The SBD device can be connected via Fibre Channel, Fibre Channel over Ethernet, or even iSCSI. Thus, an iSCSI target can become a sort-of network-based quorum server; the advantage is that it does not require a smart host at your third location, just block storage. The SBD partitions themselves B<must not> be mirrored (via MD, DRBD, or the storage layer itself), since this could result in a split-mirror scenario. Nor can they reside on cLVM2 volume groups, since they must be accessed by the cluster stack before it has started the cLVM2 daemons; hence, these should be either raw partitions or logical units on (multipath) storage. The block device(s) must be accessible from all nodes. (While it is not necessary that they share the same path name on all nodes, this is considered a very good idea.) SBD will only use about one megabyte per device, so you can easily create a small partition, or very small logical units. (The size of the SBD device depends on the block size of the underlying device. Thus, 1MB is fine on plain SCSI devices and SAN storage with 512 byte blocks. On the IBM s390x architecture in particular, disks default to 4k blocks, and thus require roughly 4MB.) The number of devices will affect the operation of SBD as follows: =over =item One device In its most simple implementation, you use one device only.
This is appropriate for clusters where all your data is on the same shared storage (with internal redundancy) anyway; the SBD device does not introduce an additional single point of failure then. If the SBD device is not accessible, the daemon will fail to start and inhibit startup of cluster services. =item Two devices This configuration is a trade-off, primarily aimed at environments where host-based mirroring is used, but no third storage device is available. SBD will not commit suicide if it loses access to one mirror leg; this allows the cluster to continue to function even in the face of one outage. However, SBD will not fence the other side while only one mirror leg is available, since it does not have enough knowledge to detect an asymmetric split of the storage. So it will not be able to automatically tolerate a second failure while one of the storage arrays is down. (Though you can use the appropriate crm command to acknowledge the fence manually.) It will not start unless both devices are accessible on boot. =item Three devices In this most reliable and recommended configuration, SBD will only self-fence if more than one device is lost; hence, this configuration is resilient against temporary single device outages (be it due to failures or maintenance). Fencing messages can still be successfully relayed if at least two devices remain accessible. This configuration is appropriate for more complex scenarios where storage is not confined to a single array. For example, host-based mirroring solutions could have one SBD per mirror leg (not mirrored itself), and an additional tie-breaker on iSCSI. It will only start if at least two devices are accessible on boot. =back After you have chosen the devices and created the appropriate partitions and perhaps multipath alias names to ease management, use the C command described above to initialize the SBD metadata on them. 
=head3 Sharing the block device(s) between multiple clusters It is possible to share the block devices between multiple clusters, provided the total number of nodes accessing them does not exceed I<255> nodes, and they all must share the same SBD timeouts (since these are part of the metadata). If you are using multiple devices this can reduce the setup overhead required. However, you should B<not> share devices between clusters in different security domains. =head2 Configure SBD to start on boot On systems using C<sysvinit>, the C<openais> or C<corosync> system start-up scripts must handle starting or stopping C<sbd> as required before starting the rest of the cluster stack. For C<systemd>, sbd simply has to be enabled using systemctl enable sbd.service The daemon is brought online on each node before corosync and Pacemaker are started, and terminated only after all other cluster components have been shut down - ensuring that cluster resources are never activated without SBD supervision. =head2 Configuration via sysconfig The system instance of C<sbd> is configured via F</etc/sysconfig/sbd>. In this file, you must specify the device(s) used, as well as any options to pass to the daemon: SBD_DEVICE="/dev/sda1;/dev/sdb1;/dev/sdc1" SBD_PACEMAKER="true" C<sbd> will fail to start if no C<SBD_DEVICE> is specified. See the installed template or the section on configuration via environment for more options that can be configured here. In general, configuration done via parameters takes precedence over the configuration from the configuration file.
=head2 Configuration via environment =over @environment_section@ =back =head2 Testing the sbd installation After a restart of the cluster stack on this node, you can now try sending a test message to it as root, from this or any other node: sbd -d /dev/sda1 message node1 test The node will acknowledge the receipt of the message in the system logs: Aug 29 14:10:00 node1 sbd: [13412]: info: Received command test from node2 This confirms that SBD is indeed up and running on the node, and that it is ready to receive messages. Make B<sure> that F</etc/sysconfig/sbd> is identical on all cluster nodes, and that all cluster nodes are running the daemon. =head1 Pacemaker CIB integration =head2 Fencing resource Pacemaker can only interact with SBD to issue a node fence if there is a configured fencing resource. This should be a primitive, not a clone, as follows: primitive fencing-sbd stonith:external/sbd \ params pcmk_delay_max=30 This will automatically use the same devices as configured in F</etc/sysconfig/sbd>. While you should not configure this as a clone (as Pacemaker will register the fencing device on each node automatically), the I<pcmk_delay_max> setting enables random fencing delay which ensures, in case a split-brain scenario did occur in a two node cluster, that one of the nodes has a better chance to survive to avoid double fencing. SBD also supports turning the reset request into a crash request, which may be helpful for debugging if you have kernel crashdumping configured; then, every fence request will cause the node to dump core. You can enable this via the C<crashdump="true"> parameter on the fencing resource. This is B<not> recommended for production use, but only for debugging phases. =head2 General cluster properties You must also enable STONITH in general, and set the STONITH timeout to be at least twice the I<msgwait> timeout you have configured, to allow enough time for the fencing message to be delivered.
If your I<msgwait> timeout is 60 seconds, this is a possible configuration: property stonith-enabled="true" property stonith-timeout="120s" B<Caution>: if I<stonith-timeout> is too low for I<msgwait> and the system overhead, sbd will never be able to successfully complete a fence request. This will create a fencing loop. Note that the sbd fencing agent will try to detect this and automatically extend the I<stonith-timeout> setting to a reasonable value, on the assumption that sbd modifying your configuration is preferable to not fencing. =head1 Management tasks =head2 Recovering from temporary SBD device outage If you have multiple devices, failure of a single device is not immediately fatal. C<sbd> will retry to restart the monitor for the device every 5 seconds by default. However, you can tune this via the options to the I<watch> command. In case you wish to immediately force a restart of all currently disabled monitor processes, you can send a I<SIGUSR1> to the SBD I<inquisitor> process. =head1 LICENSE Copyright (C) 2008-2013 Lars Marowsky-Bree This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This software is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. For details see the GNU General Public License at http://www.gnu.org/licenses/gpl-2.0.html (version 2) and/or http://www.gnu.org/licenses/gpl.html (the newest as per "any later").
sbd-1.5.1/sbd.pc.in000066400000000000000000000002541414464774600140110ustar00rootroot00000000000000prefix=@prefix@ exec_prefix=@exec_prefix@ confdir=@CONFIGDIR@ Name: @PACKAGE@ Version: @VERSION@ Description: @PACKAGE@ build information required for downstream projects sbd-1.5.1/sbd.spec000066400000000000000000000273001414464774600137350ustar00rootroot00000000000000# # spec file for package sbd # # Copyright (c) 2014 SUSE LINUX Products GmbH, Nuernberg, Germany. # Copyright (c) 2013 Lars Marowsky-Bree # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed # upon. The license for this file, and modifications and additions to the # file, is the same license as for the pristine package itself (unless the # license for the pristine package is not an Open Source License, in which # case the license is the MIT License). An "Open Source License" is a # license that conforms to the Open Source Definition (Version 1.9) # published by the Open Source Initiative. # Please submit bugfixes or comments via http://bugs.opensuse.org/ # %global longcommit 7bcdf69597042c31ea0b4a523e732d4bbb99b3a0 %global shortcommit %(echo %{longcommit}|cut -c1-8) %global modified %(echo %{longcommit}-|cut -f2 -d-) %global github_owner Clusterlabs %global commit_counter 0 %global build_counter 0 %global buildnum %(expr %{commit_counter} + %{build_counter}) %ifarch s390x s390 # minimum timeout on LPAR diag288 watchdog is 15s %global watchdog_timeout_default 15 %else %global watchdog_timeout_default 5 %endif # Be careful with sync_resource_startup_default # being enabled. This configuration has # to be in sync with configuration in pacemaker # where it is called sbd_sync - assure by e.g. # mutual rpm dependencies. %bcond_without sync_resource_startup_default # Syncing enabled per default will lead to # syncing enabled on upgrade without adaption # of the config. 
# Setting can still be overruled via sysconfig. # The setting in the config-template packaged # will follow the default if below is is left # empty. But it is possible to have the setting # in the config-template deviate from the default # by setting below to an explicit 'yes' or 'no'. %global sync_resource_startup_sysconfig "" Name: sbd Summary: Storage-based death License: GPLv2+ Group: System Environment/Daemons Version: 1.5.1 Release: 99.%{buildnum}.%{shortcommit}.%{modified}git%{?dist} Url: https://github.com/%{github_owner}/%{name} Source0: https://github.com/%{github_owner}/%{name}/archive/%{longcommit}/%{name}-%{longcommit}.tar.gz BuildRoot: %{_tmppath}/%{name}-%{version}-build BuildRequires: autoconf BuildRequires: automake BuildRequires: libuuid-devel BuildRequires: glib2-devel BuildRequires: libaio-devel BuildRequires: corosync-devel %if 0%{?suse_version} BuildRequires: libpacemaker-devel %else BuildRequires: pacemaker-libs-devel %endif BuildRequires: libtool BuildRequires: libuuid-devel BuildRequires: libxml2-devel BuildRequires: pkgconfig BuildRequires: make Conflicts: fence-agents-sbd < 4.5.0 %if 0%{?rhel} > 0 ExclusiveArch: i686 x86_64 s390x aarch64 ppc64le %endif %if %{defined systemd_requires} %systemd_requires %endif %description This package contains the storage-based death functionality. Available rpmbuild rebuild options: --with(out) : sync_resource_startup_default %package tests Summary: Storage-based death environment for regression tests License: GPLv2+ Group: System Environment/Daemons %description tests This package provides an environment + testscripts for regression-testing sbd. 
%prep ########################################################### # %setup -n sbd-%{version} -q %setup -q -n %{name}-%{longcommit} ########################################################### %build ./autogen.sh export CFLAGS="$RPM_OPT_FLAGS -Wall -Werror" %configure --with-watchdog-timeout-default=%{watchdog_timeout_default} \ --with-sync-resource-startup-default=%{?with_sync_resource_startup_default:yes}%{!?with_sync_resource_startup_default:no} \ --with-sync-resource-startup-sysconfig=%{sync_resource_startup_sysconfig} \ --with-runstatedir=%{_rundir} make %{?_smp_mflags} ########################################################### %install ########################################################### make DESTDIR=$RPM_BUILD_ROOT LIBDIR=%{_libdir} install rm -rf ${RPM_BUILD_ROOT}%{_libdir}/stonith install -D -m 0755 src/sbd.sh $RPM_BUILD_ROOT/usr/share/sbd/sbd.sh install -D -m 0755 tests/regressions.sh $RPM_BUILD_ROOT/usr/share/sbd/regressions.sh %if %{defined _unitdir} install -D -m 0644 src/sbd.service $RPM_BUILD_ROOT/%{_unitdir}/sbd.service install -D -m 0644 src/sbd_remote.service $RPM_BUILD_ROOT/%{_unitdir}/sbd_remote.service %endif mkdir -p ${RPM_BUILD_ROOT}%{_sysconfdir}/sysconfig install -m 644 src/sbd.sysconfig ${RPM_BUILD_ROOT}%{_sysconfdir}/sysconfig/sbd # Don't package static libs find %{buildroot} -name '*.a' -type f -print0 | xargs -0 rm -f find %{buildroot} -name '*.la' -type f -print0 | xargs -0 rm -f %clean rm -rf %{buildroot} %if %{defined _unitdir} %post %systemd_post sbd.service %systemd_post sbd_remote.service %preun %systemd_preun sbd.service %systemd_preun sbd_remote.service %postun %systemd_postun sbd.service %systemd_postun sbd_remote.service %endif %files ########################################################### %defattr(-,root,root) %config(noreplace) %{_sysconfdir}/sysconfig/sbd %{_sbindir}/sbd %{_datadir}/sbd %{_datadir}/pkgconfig/sbd.pc %exclude %{_datadir}/sbd/regressions.sh %doc %{_mandir}/man8/sbd* %if %{defined _unitdir} 
%{_unitdir}/sbd.service %{_unitdir}/sbd_remote.service %endif %doc COPYING %files tests %defattr(-,root,root) %dir %{_datadir}/sbd %{_datadir}/sbd/regressions.sh %{_libdir}/libsbdtestbed* %changelog * Mon Nov 15 2021 - 1.5.1-99.0.7bcdf695.git - improve/fix cmdline handling - tell the actual watchdog device specified with -w - tolerate and strip any leading spaces of commandline option values - Sanitize numeric arguments - if start-delay enabled, not explicitly given and msgwait can't be read from disk (diskless) use 2 * watchdog-timeout - avoid using deprecated valloc for disk-io-buffers - avoid frequent alloc/free of aligned buffers to prevent fragmentation - fix memory-leak in one-time-allocations of sector-buffers - fix AIO-API usage: properly destroy io-context - improve/fix build environment - validate configure options for paths - remove unneeded complexity of configure.ac hierarchy - correctly derive package version from git (regression since 1.5.0) - make runstatedir configurable and derive from distribution * Tue Jun 8 2021 - 1.5.0-99.0.2a00ac70.git - default to resource-syncing with pacemaker in spec-file and configure.ac This default has to match between sbd and pacemaker and thus qualifies this release for a minor-version-bump - fix some regressions introduced by adding configurability previously - adapt description of startup/shutdown sync with pacemaker - make watchdog warning messages more understandable * Wed Dec 2 2020 - 1.4.2-99.1.bfeee963.git - improve build/CI-friendlyness - * travis: switch to F32 as build-host - switch to F32 & leap-15.2 - changes for mock-2.0 - turn off loop-devices & device-mapper on x86_64 targets because - of changes in GCE - * regressions.sh: get timeouts from disk-header to go with proper defaults - for architecture - * use configure for watchdog-default-timeout & others - * ship sbd.pc with basic sbd build information for downstream packages - to use - * add number of commits since version-tag to build-counter - add 
robustness against misconfiguration / improve documentation - * add environment section to man-page previously just available in - template-config - * inform the user to restart the sbd service after disk-initialization - * refuse to start if any of the configured device names is invalid - * add handshake to sync startup/shutdown with pacemakerd - Previously sbd just waited for the cib-connnection to show up/go away - which isn't robust at all. - The new feature needs new pacemakerd-api as counterpart. - Thus build checks for presence of pacemakerd-api. - To simplify downstream adoption behavior is configurable at runtime - via configure-file with a build-time-configurable default. - * refuse to start if qdevice-sync_timeout doesn't match watchdog-timeout - Needed in particular as qdevice-sync_timeout delays quorum-state-update - and has a default of 30s that doesn't match the 5s watchdog-timeout - default. - Fix: sbd-pacemaker: handle new no_quorum_demote + robustness against new - policies added - Fix: agent: correctly compare string values when calculating timeout - Fix: scheduling: overhaul the whole thing - * prevent possible lockup when format in proc changes - * properly get and handle scheduler policy & prio - * on SCHED_RR failing push to the max with SCHED_OTHER * Tue Nov 19 2019 - 1.4.1-99.1.aca7907c.git - improvements/clarifications in documentation - properly finalize cmap connection when disconnected from cluster - make handling of cib-connection loss more robust - silence some coverity findings - overhaul log for reasonable prios and details - if current slice doesn't have rt-budget move to root-slice - periodically ping corosync daemon for liveness - actually use crashdump timeout if configured - avoid deprecated names for g_main-loop-funcitons - conflict with fence-agents-sbd < 4.5.0 - rather require corosync-devel provided by most distributions - make devices on cmdline overrule those coming via SBD_DEVICE - make 15s timeout on s390 be used 
consistently - improve build/test for CI-friendlyness - * add autogen.sh - * enable/improve out-of-tree-building - * make tar generation smarter - * don't modify sbd.spec - * make distcheck-target work - * Add tests/regressions.sh to check-target - * use unique devmapper names for multiple tests in parallel - * consistently use serial test-harness for visible progress - * package tests into separate package (not packaged before) - * add preload-library to intercept reboots while testing - * add tests for sbd in daemon-mode & watchdog-dev-handling - * make tests work in non-privileged containers * Mon Jan 14 2019 - 1.4.0-0.1.2d595fdd.git - updated travis-CI (ppc64le-build, fedora29, remove need for alectolytic-build-container) - make watchdog-device-query easier to be handled by an SELinux-policy - configurable delay value for SBD_DELAY_START - use pacemaker's new pe api with constructors/destructors - make timeout-action executed by sbd configurable - init script for sysv systems - version bump to v1.4.0 to denote Pacemaker 2.0.0 compatibility * Fri Jun 29 2018 - 1.3.1-0.1.e102d9ed.git - removed unneeded python-devel build-requirement - changed legacy corosync-devel to corosynclib-devel * Fri Nov 3 2017 - 1.3.1-0.1.a180176c.git - Add commands to test/query watchdogs - Allow 2-node-operation with a single shared-disk - Overhaul of the command-line options & config-file - Proper handling of off instead of reboot - Refactored disk-servant for more robust communication with parent - Fix config for Debian + configurable location of config - Fixes in sbd.sh - multiple SBD devices and others * Sun Mar 27 2016 - 1.3.0-0.1.4ee36fa3.git - Changes since v1.2.0 like adding the possibility to have a watchdog-only setup without shared-block-devices legitimate a bump to v1.3.0. 
* Mon Oct 13 2014 - 1.2.1-0.4.3de531ed.git - Fixes for suitability to the el7 environment * Tue Sep 30 2014 - 1.2.1-0.3.8f912945.git - Only build on archs supported by the HA Add-on * Fri Aug 29 2014 - 1.2.1-0.2.8f912945.git - Remove some additional SUSE-isms * Fri Aug 29 2014 - 1.2.1-0.1.8f912945.git - Prepare for package review Resolves: rhbz#1134245 sbd-1.5.1/src/000077500000000000000000000000001414464774600130765ustar00rootroot00000000000000sbd-1.5.1/src/Makefile.am000066400000000000000000000005331414464774600151330ustar00rootroot00000000000000AM_CFLAGS = -D_GNU_SOURCE -DCHECK_AIS -DSBINDIR=\"$(sbindir)\" AM_CPPFLAGS = -I$(includedir)/pacemaker \ -I$(includedir)/heartbeat \ $(glib_CFLAGS) sbin_PROGRAMS = sbd sbd_SOURCES = sbd-common.c sbd-inquisitor.c sbd-pacemaker.c sbd-cluster.c setproctitle.c sbd.h sbd.sysconfig if SUPPORT_SHARED_DISK sbd_SOURCES += sbd-md.c endif sbd-1.5.1/src/sbd-cluster.c000066400000000000000000000545111414464774600154770ustar00rootroot00000000000000/* * Copyright (C) 2013 Lars Marowsky-Bree * * Based on crm_mon.c, which was: * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
*/ #include #include #include #include #include #include #include #include #include #include #include #if CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT #include #endif #include "sbd.h" //undef SUPPORT_PLUGIN //define SUPPORT_PLUGIN 1 /* binary for pacemaker-remote has changed with pacemaker 2 */ #ifdef CRM_SCORE_INFINITY #define PACEMAKER_REMOTE_BINARY "pacemaker-remoted" #else #define PACEMAKER_REMOTE_BINARY "pacemaker_remoted" #endif static bool remote_node = false; static pid_t remoted_pid = 0; static int reconnect_msec = 1000; static GMainLoop *mainloop = NULL; static guint notify_timer = 0; static crm_cluster_t cluster; static gboolean sbd_remote_check(gpointer user_data); static long unsigned int find_pacemaker_remote(void); static void sbd_membership_destroy(gpointer user_data); #if SUPPORT_PLUGIN static void sbd_plugin_membership_dispatch(cpg_handle_t handle, const struct cpg_name *groupName, uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len) { if(msg_len > 0) { set_servant_health(pcmk_health_online, LOG_INFO, "Connected to %s", name_for_cluster_type(get_cluster_type())); } else { set_servant_health(pcmk_health_unclean, LOG_WARNING, "Broken %s message", name_for_cluster_type(get_cluster_type())); } notify_parent(); return; } #endif #if SUPPORT_COROSYNC #if CHECK_VOTEQUORUM_HANDLE #include static votequorum_handle_t votequorum_handle = 0; #endif #if CHECK_TWO_NODE static bool two_node = false; #endif static bool ever_seen_both = false; static int cpg_membership_entries = -1; #if CHECK_QDEVICE_SYNC_TIMEOUT #include static bool using_qdevice = false; static uint32_t qdevice_sync_timeout = /* in seconds */ VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT / 1000; #endif #if CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT #include static cmap_handle_t cmap_handle = 0; static cmap_track_handle_t track_handle = 0; static GSource *cmap_source = NULL; #endif void sbd_cpg_membership_health_update() { if(cpg_membership_entries > 0) { #if CHECK_TWO_NODE bool 
quorum_is_suspect_two_node = (two_node && ever_seen_both && cpg_membership_entries == 1); #endif #if CHECK_QDEVICE_SYNC_TIMEOUT bool quorum_is_suspect_qdevice_timing = using_qdevice && (qdevice_sync_timeout > timeout_watchdog); #endif do { #if CHECK_TWO_NODE if (quorum_is_suspect_two_node) { /* Alternative would be asking votequorum for number of votes. * Using pacemaker's cpg as source for number of active nodes * avoids binding to an additional library, is definitely * less code to write and we wouldn't have to combine data * from 3 sources (cmap, cpg & votequorum) in a potentially * racy environment. */ set_servant_health(pcmk_health_noquorum, LOG_WARNING, "Connected to %s but requires both nodes present", name_for_cluster_type(get_cluster_type()) ); break; } #endif #if CHECK_QDEVICE_SYNC_TIMEOUT if (quorum_is_suspect_qdevice_timing) { /* We can't really trust quorum info as qdevice-sync_timeout * makes reaction of quorum too sluggish for our * watchdog-timeout. */ set_servant_health(pcmk_health_noquorum, LOG_WARNING, "Connected to %s but quorum using qdevice is distrusted " "for SBD as qdevice-sync_timeout (%ds) > watchdog-timeout " "(%lus).", name_for_cluster_type(get_cluster_type()), qdevice_sync_timeout, timeout_watchdog ); break; } #endif set_servant_health(pcmk_health_online, LOG_INFO, "Connected to %s (%u members)%s", name_for_cluster_type(get_cluster_type()), cpg_membership_entries, #if CHECK_QDEVICE_SYNC_TIMEOUT using_qdevice?" 
using qdevice for quorum":"" #else "" #endif ); } while (false); if (cpg_membership_entries > 1) { ever_seen_both = true; } } else { set_servant_health(pcmk_health_unclean, LOG_WARNING, "Empty %s membership", name_for_cluster_type(get_cluster_type())); } } void sbd_cpg_membership_dispatch(cpg_handle_t handle, const struct cpg_name *groupName, const struct cpg_address *member_list, size_t member_list_entries, const struct cpg_address *left_list, size_t left_list_entries, const struct cpg_address *joined_list, size_t joined_list_entries) { cpg_membership_entries = member_list_entries; sbd_cpg_membership_health_update(); notify_parent(); } #if CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT static void sbd_cmap_notify_fn( cmap_handle_t cmap_handle, cmap_track_handle_t cmap_track_handle, int32_t event, const char *key_name, struct cmap_notify_value new_val, struct cmap_notify_value old_val, void *user_data) { switch (event) { case CMAP_TRACK_ADD: case CMAP_TRACK_MODIFY: switch (new_val.type) { case CMAP_VALUETYPE_UINT8: #if CHECK_TWO_NODE if (!strcmp(key_name, "quorum.two_node")) { two_node = *((uint8_t *) new_val.data); } else { return; } break; #else return; #endif case CMAP_VALUETYPE_STRING: #if CHECK_QDEVICE_SYNC_TIMEOUT if (!strcmp(key_name, "quorum.device.model")) { using_qdevice = ((new_val.data) && strlen((char *) new_val.data)); } else { return; } break; #else return; #endif case CMAP_VALUETYPE_UINT32: #if CHECK_QDEVICE_SYNC_TIMEOUT if (!strcmp(key_name, "quorum.device.sync_timeout")) { if (new_val.data) { qdevice_sync_timeout = *((uint32_t *) new_val.data) / 1000; } else { qdevice_sync_timeout = VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT / 1000; } } else { return; } break; #else return; #endif default: return; } break; case CMAP_TRACK_DELETE: switch (new_val.type) { case CMAP_VALUETYPE_UINT8: #if CHECK_TWO_NODE if (!strcmp(key_name, "quorum.two_node")) { two_node = false; } else { return; } break; #else return; #endif case CMAP_VALUETYPE_STRING: #if 
CHECK_QDEVICE_SYNC_TIMEOUT if (!strcmp(key_name, "quorum.device.model")) { using_qdevice = false; } else { return; } break; #else return; #endif case CMAP_VALUETYPE_UINT32: #if CHECK_QDEVICE_SYNC_TIMEOUT if (!strcmp(key_name, "quorum.device.sync_timeout")) { qdevice_sync_timeout = VOTEQUORUM_QDEVICE_DEFAULT_SYNC_TIMEOUT / 1000; } else { return; } break; #else return; #endif default: return; } break; default: return; } sbd_cpg_membership_health_update(); notify_parent(); } static gboolean cmap_dispatch_callback (gpointer user_data) { cmap_dispatch(cmap_handle, CS_DISPATCH_ALL); return TRUE; } static void cmap_destroy(void) { if (cmap_source) { g_source_destroy(cmap_source); cmap_source = NULL; } if (track_handle) { cmap_track_delete(cmap_handle, track_handle); track_handle = 0; } if (cmap_handle) { cmap_finalize(cmap_handle); cmap_handle = 0; } } static gboolean verify_against_cmap_config(void) { #if CHECK_TWO_NODE uint8_t two_node_u8 = 0; #endif #if CHECK_QDEVICE_SYNC_TIMEOUT char *qdevice_model = NULL; #endif int cmap_fd; if (!track_handle) { if (cmap_initialize(&cmap_handle) != CS_OK) { cl_log(LOG_WARNING, "Cannot initialize CMAP service\n"); goto out; } #if CHECK_TWO_NODE if (cmap_track_add(cmap_handle, "quorum.two_node", CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD, sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) { cl_log(LOG_WARNING, "Failed adding CMAP tracker for 2Node-mode\n"); goto out; } #endif #if CHECK_QDEVICE_SYNC_TIMEOUT if (cmap_track_add(cmap_handle, "quorum.device.model", CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD, sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) { cl_log(LOG_WARNING, "Failed adding CMAP tracker for qdevice-model\n"); goto out; } if (cmap_track_add(cmap_handle, "quorum.device.sync_timeout", CMAP_TRACK_DELETE|CMAP_TRACK_MODIFY|CMAP_TRACK_ADD, sbd_cmap_notify_fn, NULL, &track_handle) != CS_OK) { cl_log(LOG_WARNING, "Failed adding CMAP tracker for qdevice-sync_timeout\n"); goto out; } #endif /* add the tracker to 
mainloop */ if (cmap_fd_get(cmap_handle, &cmap_fd) != CS_OK) { cl_log(LOG_WARNING, "Failed to get a file handle for cmap\n"); goto out; } if (!(cmap_source = g_unix_fd_source_new (cmap_fd, G_IO_IN))) { cl_log(LOG_WARNING, "Couldn't create source for cmap\n"); goto out; } g_source_set_callback(cmap_source, cmap_dispatch_callback, NULL, NULL); g_source_attach(cmap_source, NULL); } #if CHECK_TWO_NODE if (cmap_get_uint8(cmap_handle, "quorum.two_node", &two_node_u8) == CS_OK) { cl_log(two_node_u8? LOG_NOTICE : LOG_INFO, "Corosync is%s in 2Node-mode", two_node_u8?"":" not"); two_node = two_node_u8; } else { cl_log(LOG_INFO, "quorum.two_node not present in cmap\n"); } #endif #if CHECK_QDEVICE_SYNC_TIMEOUT if (cmap_get_string(cmap_handle, "quorum.device.model", &qdevice_model) == CS_OK) { using_qdevice = qdevice_model && strlen(qdevice_model); cl_log(using_qdevice? LOG_NOTICE : LOG_INFO, "Corosync is%s using qdevice", using_qdevice?"":" not"); } else { cl_log(LOG_INFO, "quorum.device.model not present in cmap\n"); } if (cmap_get_uint32(cmap_handle, "quorum.device.sync_timeout", &qdevice_sync_timeout) == CS_OK) { qdevice_sync_timeout /= 1000; cl_log(LOG_INFO, "Corosync is using qdevice-sync_timeout=%ds", qdevice_sync_timeout); } else { cl_log(LOG_INFO, "quorum.device.sync_timeout not present in cmap\n"); } #endif return TRUE; out: cmap_destroy(); return FALSE; } #endif #endif static gboolean notify_timer_cb(gpointer data) { cl_log(LOG_DEBUG, "Refreshing %sstate", remote_node?"remote ":""); if(remote_node) { sbd_remote_check(NULL); return TRUE; } switch (get_cluster_type()) { #if HAVE_DECL_PCMK_CLUSTER_CLASSIC_AIS case pcmk_cluster_classic_ais: send_cluster_text(crm_class_quorum, NULL, TRUE, NULL, crm_msg_ais); break; #endif case pcmk_cluster_corosync: do { #if SUPPORT_COROSYNC && CHECK_VOTEQUORUM_HANDLE struct votequorum_info info; if (votequorum_getinfo(votequorum_handle, 0, &info) != CS_OK) { votequorum_finalize(votequorum_handle); if 
(votequorum_initialize(&votequorum_handle, NULL) != CS_OK) { votequorum_handle = 0; break; } if (votequorum_getinfo(votequorum_handle, 0, &info) != CS_OK) { break; } } #endif notify_parent(); } while (0); break; #if HAVE_DECL_PCMK_CLUSTER_CMAN case pcmk_cluster_cman: notify_parent(); break; #endif default: break; } return TRUE; } static void sbd_membership_connect(void) { bool connected = false; cl_log(LOG_INFO, "Attempting cluster connection"); cluster.destroy = sbd_membership_destroy; #if SUPPORT_PLUGIN cluster.cpg.cpg_deliver_fn = sbd_plugin_membership_dispatch; #endif #if SUPPORT_COROSYNC cluster.cpg.cpg_confchg_fn = sbd_cpg_membership_dispatch; #endif while(connected == false) { enum cluster_type_e stack = get_cluster_type(); if(get_cluster_type() == pcmk_cluster_unknown) { crm_debug("Attempting pacemaker remote connection"); /* Nothing is up, go looking for the pacemaker remote process */ if(find_pacemaker_remote() > 0) { connected = true; } } else { cl_log(LOG_INFO, "Attempting connection to %s", name_for_cluster_type(stack)); #if SUPPORT_COROSYNC && (CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT) if (verify_against_cmap_config()) { #endif if(crm_cluster_connect(&cluster)) { connected = true; } #if SUPPORT_COROSYNC && (CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT) } #endif } if(connected == false) { cl_log(LOG_INFO, "Failed, retrying in %ds", reconnect_msec / 1000); sleep(reconnect_msec / 1000); } } set_servant_health(pcmk_health_transient, LOG_INFO, "Connected, waiting for initial membership"); notify_parent(); notify_timer_cb(NULL); } static void sbd_membership_destroy(gpointer user_data) { cl_log(LOG_WARNING, "Lost connection to %s", name_for_cluster_type(get_cluster_type())); if (get_cluster_type() != pcmk_cluster_unknown) { #if SUPPORT_COROSYNC && (CHECK_TWO_NODE || CHECK_QDEVICE_SYNC_TIMEOUT) cmap_destroy(); #endif } set_servant_health(pcmk_health_unclean, LOG_ERR, "Cluster connection terminated"); notify_parent(); /* Attempt to reconnect, the watchdog 
will take the node down if the problem isn't transient */ sbd_membership_connect(); } /* * \internal * \brief Get process ID and name associated with a /proc directory entry * * \param[in] entry Directory entry (must be result of readdir() on /proc) * \param[out] name If not NULL, a char[16] to hold the process name * \param[out] pid If not NULL, will be set to process ID of entry * * \return 0 on success, -1 if entry is not for a process or info not found * * \note This should be called only on Linux systems, as not all systems that * support /proc store process names and IDs in the same way. * Copied from the Pacemaker implementation. */ int sbd_procfs_process_info(struct dirent *entry, char *name, int *pid) { int fd, local_pid; FILE *file; struct stat statbuf; char procpath[128] = { 0 }; /* We're only interested in entries whose name is a PID, * so skip anything non-numeric or that is too long. * * 114 = 128 - strlen("/proc/") - strlen("/status") - 1 */ local_pid = atoi(entry->d_name); if ((local_pid <= 0) || (strlen(entry->d_name) > 114)) { return -1; } if (pid) { *pid = local_pid; } /* Get this entry's file information */ strcpy(procpath, "/proc/"); strcat(procpath, entry->d_name); fd = open(procpath, O_RDONLY); if (fd < 0 ) { return -1; } if (fstat(fd, &statbuf) < 0) { close(fd); return -1; } close(fd); /* We're only interested in subdirectories */ if (!S_ISDIR(statbuf.st_mode)) { return -1; } /* Read the first entry ("Name:") from the process's status file. * We could handle the valgrind case if we parsed the cmdline file * instead, but that's more of a pain than it's worth. 
*/ if (name != NULL) { strcat(procpath, "/status"); file = fopen(procpath, "r"); if (!file) { return -1; } if (fscanf(file, "Name:\t%15[a-zA-Z0-9 _-]", name) != 1) { fclose(file); return -1; } fclose(file); } return 0; } static gboolean sbd_remote_check(gpointer user_data) { static int have_proc_pid = 0; int running = 0; cl_log(LOG_DEBUG, "Checking pacemaker remote connection: %d/%d", have_proc_pid, remoted_pid); if(have_proc_pid == 0) { char proc_path[PATH_MAX], exe_path[PATH_MAX]; /* check to make sure pid hasn't been reused by another process */ snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", (long unsigned int)getpid()); have_proc_pid = 1; if(readlink(proc_path, exe_path, PATH_MAX - 1) < 0) { have_proc_pid = -1; } } if (remoted_pid <= 0) { set_servant_health(pcmk_health_transient, LOG_WARNING, "No Pacemaker Remote connection"); goto notify; } else if (kill(remoted_pid, 0) < 0 && errno == ESRCH) { /* Not running */ } else if(have_proc_pid == -1) { running = 1; cl_log(LOG_DEBUG, "Poccess %ld is active", (long)remoted_pid); } else { int rc = 0; char proc_path[PATH_MAX], exe_path[PATH_MAX]; /* check to make sure pid hasn't been reused by another process */ snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", (long unsigned int)remoted_pid); rc = readlink(proc_path, exe_path, PATH_MAX - 1); if (rc < 0) { crm_perror(LOG_ERR, "Could not read from %s", proc_path); goto done; } exe_path[rc] = 0; if (strcmp(exe_path, SBINDIR "/" PACEMAKER_REMOTE_BINARY) == 0) { cl_log(LOG_DEBUG, "Process %s (%ld) is active", exe_path, (long)remoted_pid); running = 1; } } done: if(running) { set_servant_health(pcmk_health_online, LOG_INFO, "Connected to Pacemaker Remote %lu", (long unsigned int)remoted_pid); } else { set_servant_health(pcmk_health_unclean, LOG_WARNING, "Connection to Pacemaker Remote %lu lost", (long unsigned int)remoted_pid); } notify: notify_parent(); if(running == 0) { sbd_membership_connect(); } return true; } static long unsigned int 
find_pacemaker_remote(void) { DIR *dp; char entry_name[16]; struct dirent *entry; dp = opendir("/proc"); if (!dp) { /* no proc directory to search through */ cl_log(LOG_NOTICE, "Can not read /proc directory to track existing components"); return FALSE; } while ((entry = readdir(dp)) != NULL) { int pid; if (sbd_procfs_process_info(entry, entry_name, &pid) < 0) { continue; } /* entry_name is truncated to 16 characters including the nul terminator */ cl_log(LOG_DEBUG, "Found %s at %u", entry_name, pid); if (strncmp(entry_name, PACEMAKER_REMOTE_BINARY, 15) == 0) { cl_log(LOG_NOTICE, "Found Pacemaker Remote at PID %u", pid); remoted_pid = pid; remote_node = true; break; } } closedir(dp); return remoted_pid; } static void clean_up(int rc) { #if CHECK_VOTEQUORUM_HANDLE votequorum_finalize(votequorum_handle); votequorum_handle = 0; /* there isn't really an invalid handle value * just to be back where we started */ #endif return; } static void cluster_shutdown(int nsig) { clean_up(0); } int servant_cluster(const char *diskname, int mode, const void* argp) { enum cluster_type_e cluster_stack = get_cluster_type(); crm_system_name = strdup("sbd:cluster"); cl_log(LOG_NOTICE, "Monitoring %s cluster health", name_for_cluster_type(cluster_stack)); set_proc_title("sbd: watcher: Cluster"); sbd_membership_connect(); /* stonith_our_uname = cluster.uname; */ /* stonith_our_uuid = cluster.uuid; */ mainloop = g_main_loop_new(NULL, FALSE); notify_timer = g_timeout_add(timeout_loop * 1000, notify_timer_cb, NULL); mainloop_add_signal(SIGTERM, cluster_shutdown); mainloop_add_signal(SIGINT, cluster_shutdown); g_main_loop_run(mainloop); g_main_loop_unref(mainloop); clean_up(0); return 0; /* never reached */ } sbd-1.5.1/src/sbd-common.c000066400000000000000000000763441414464774600153160ustar00rootroot00000000000000/* * Copyright (C) 2013 Lars Marowsky-Bree * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as 
published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ #include "sbd.h" #include #include #ifdef __GLIBC__ #include #endif #include #include #include #include #include #include #include #ifdef _POSIX_MEMLOCK # include #endif /* Tunable defaults: */ unsigned long timeout_watchdog = SBD_WATCHDOG_TIMEOUT_DEFAULT; int timeout_msgwait = 2 * SBD_WATCHDOG_TIMEOUT_DEFAULT; unsigned long timeout_watchdog_warn = calculate_timeout_watchdog_warn(SBD_WATCHDOG_TIMEOUT_DEFAULT); bool do_calculate_timeout_watchdog_warn = true; int timeout_allocate = 2; int timeout_loop = 1; int timeout_io = 3; int timeout_startup = 120; int watchdog_use = 1; int watchdog_set_timeout = 1; unsigned long timeout_watchdog_crashdump = 0; int skip_rt = 0; int debug = 0; int debug_mode = 0; char *watchdogdev = NULL; bool watchdogdev_is_default = false; char * local_uname; /* Global, non-tunable variables: */ int sector_size = 0; int watchdogfd = -1; int servant_health = 0; /*const char *devname;*/ const char *cmdname; void usage(void) { fprintf(stderr, "Shared storage fencing tool.\n" "Syntax:\n" " %s \n" "Options:\n" "-d Block device to use (mandatory; can be specified up to 3 times)\n" "-h Display this help.\n" "-n Set local node name; defaults to uname -n (optional)\n" "\n" "-R Do NOT enable realtime priority (debugging only)\n" "-W Use watchdog (recommended) (watch only)\n" "-w Specify watchdog device (optional) (watch only)\n" "-T Do NOT initialize the watchdog timeout (watch only)\n" "-S 
<0|1> Set start mode if the node was previously fenced (watch only)\n" "-p Write pidfile to the specified path (watch only)\n" "-v|-vv|-vvv Enable verbose|debug|debug-library logging (optional)\n" "\n" "-1 Set watchdog timeout to N seconds (optional, create only)\n" "-2 Set slot allocation timeout to N seconds (optional, create only)\n" "-3 Set daemon loop timeout to N seconds (optional, create only)\n" "-4 Set msgwait timeout to N seconds (optional, create only)\n" "-5 Warn if loop latency exceeds threshold (optional, watch only)\n" " (default is 3, set to 0 to disable)\n" "-C Watchdog timeout to set before crashdumping\n" " (def: 0s = disable gracefully, optional)\n" "-I Async IO read timeout (defaults to 3 * loop timeout, optional)\n" "-s Timeout to wait for devices to become available (def: 120s)\n" "-t Dampening delay before faulty servants are restarted (optional)\n" " (default is 5, set to 0 to disable)\n" "-F # of failures before a servant is considered faulty (optional)\n" " (default is 1, set to 0 to disable)\n" "-P Check Pacemaker quorum and node health (optional, watch only)\n" "-Z Enable trace mode. 
WARNING: UNSAFE FOR PRODUCTION!\n" "-r Set timeout-action to comma-separated combination of\n" " noflush|flush plus reboot|crashdump|off (default is flush,reboot)\n" "Commands:\n" #if SUPPORT_SHARED_DISK "create initialize N slots on - OVERWRITES DEVICE!\n" "list List all allocated slots on device, and messages.\n" "dump Dump meta-data header from device.\n" "allocate \n" " Allocate a slot for node (optional)\n" "message (test|reset|off|crashdump|clear|exit)\n" " Writes the specified message to node's slot.\n" #endif "watch Loop forever, monitoring own slot\n" "query-watchdog Check for available watchdog-devices and print some info\n" "test-watchdog Test the watchdog-device selected.\n" " Attention: This will arm the watchdog and have your system reset\n" " in case your watchdog is working properly!\n" , cmdname); } static int watchdog_init_interval_fd(int wdfd, int timeout) { if (ioctl(wdfd, WDIOC_SETTIMEOUT, &timeout) < 0) { cl_perror( "WDIOC_SETTIMEOUT" ": Failed to set watchdog timer to %u seconds.", timeout); cl_log(LOG_CRIT, "Please validate your watchdog configuration!"); cl_log(LOG_CRIT, "Choose a different watchdog driver or specify -T to skip this if you are completely sure."); return -1; } return 0; } int watchdog_init_interval(void) { if (watchdogfd < 0) { return 0; } if (watchdog_set_timeout == 0) { cl_log(LOG_INFO, "NOT setting watchdog timeout on explicit user request!"); return 0; } if (watchdog_init_interval_fd(watchdogfd, timeout_watchdog) < 0) { return -1; } cl_log(LOG_INFO, "Set watchdog timeout to %u seconds.", (int) timeout_watchdog); return 0; } static int watchdog_tickle_fd(int wdfd, char *wddev) { if (write(wdfd, "", 1) != 1) { cl_perror("Watchdog write failure: %s!", wddev); return -1; } return 0; } int watchdog_tickle(void) { if (watchdogfd >= 0) { return watchdog_tickle_fd(watchdogfd, watchdogdev); } return 0; } static int watchdog_init_fd(char *wddev, int timeout) { int wdfd; wdfd = open(wddev, O_WRONLY); if (wdfd >= 0) { if (((timeout 
>= 0) && (watchdog_init_interval_fd(wdfd, timeout) < 0)) || (watchdog_tickle_fd(wdfd, wddev) < 0)) { close(wdfd); return -1; } } else { cl_perror("Cannot open watchdog device '%s'", wddev); return -1; } return wdfd; } int watchdog_init(void) { if (watchdogfd < 0 && watchdogdev != NULL) { int timeout = timeout_watchdog; if (watchdog_set_timeout == 0) { cl_log(LOG_INFO, "NOT setting watchdog timeout on explicit user request!"); timeout = -1; } watchdogfd = watchdog_init_fd(watchdogdev, timeout); if (watchdogfd >= 0) { cl_log(LOG_NOTICE, "Using watchdog device '%s'", watchdogdev); if (watchdog_set_timeout) { cl_log(LOG_INFO, "Set watchdog timeout to %u seconds.", (int) timeout_watchdog); } } else { return -1; } } return 0; } static void watchdog_close_fd(int wdfd, char *wddev, bool disarm) { if (disarm) { int r; int flags = WDIOS_DISABLECARD;; /* Explicitly disarm it */ r = ioctl(wdfd, WDIOC_SETOPTIONS, &flags); if (r < 0) { cl_perror("Failed to disable hardware watchdog %s", wddev); } /* To be sure, use magic close logic, too */ for (;;) { if (write(wdfd, "V", 1) > 0) { break; } cl_perror("Cannot disable watchdog device %s", wddev); } } if (close(wdfd) < 0) { cl_perror("Watchdog close(%d) failed", wdfd); } } void watchdog_close(bool disarm) { if (watchdogfd < 0) { return; } watchdog_close_fd(watchdogfd, watchdogdev, disarm); watchdogfd = -1; } #define MAX_WATCHDOGS 64 #define SYS_CLASS_WATCHDOG "/sys/class/watchdog" #define SYS_CHAR_DEV_DIR "/sys/dev/char" #define WATCHDOG_NODEDIR "/dev/" #define WATCHDOG_NODEDIR_LEN 5 struct watchdog_list_item { dev_t dev; char *dev_node; char *dev_ident; char *dev_driver; struct watchdog_list_item *next; }; struct link_list_item { char *dev_node; char *link_name; struct link_list_item *next; }; static struct watchdog_list_item *watchdog_list = NULL; static int watchdog_list_items = 0; static void watchdog_populate_list(void) { dev_t watchdogs[MAX_WATCHDOGS + 1] = {makedev(10,130), 0}; int num_watchdogs = 1; struct dirent *entry; 
char entry_name[280]; DIR *dp; char buf[280] = ""; struct link_list_item *link_list = NULL; if (watchdog_list != NULL) { return; } /* get additional devices from /sys/class/watchdog */ dp = opendir(SYS_CLASS_WATCHDOG); if (dp) { while ((entry = readdir(dp))) { if (entry->d_type == DT_LNK) { FILE *file; snprintf(entry_name, sizeof(entry_name), SYS_CLASS_WATCHDOG "/%s/dev", entry->d_name); file = fopen(entry_name, "r"); if (file) { int major, minor; if (fscanf(file, "%d:%d", &major, &minor) == 2) { watchdogs[num_watchdogs++] = makedev(major, minor); } fclose(file); if (num_watchdogs == MAX_WATCHDOGS) { break; } } } } closedir(dp); } /* search for watchdog nodes in /dev */ dp = opendir(WATCHDOG_NODEDIR); if (dp) { /* first go for links and memorize them */ while ((entry = readdir(dp))) { if (entry->d_type == DT_LNK) { int len; snprintf(entry_name, sizeof(entry_name), WATCHDOG_NODEDIR "%s", entry->d_name); /* !realpath(entry_name, buf) unfortunately does a stat on * target so we can't really use it to check if links stay * within /dev without triggering e.g. AVC-logs (with * SELinux policy that just allows stat within /dev). * Without canonicalization that doesn't actually touch the * filesystem easily available introduce some limitations * for simplicity: * - just simple path without '..' * - just one level of symlinks (avoid e.g. 
loop-checking) */ len = readlink(entry_name, buf, sizeof(buf) - 1); if ((len < 1) || (len > sizeof(buf) - WATCHDOG_NODEDIR_LEN - 1)) { continue; } buf[len] = '\0'; if (buf[0] != '/') { memmove(&buf[WATCHDOG_NODEDIR_LEN], buf, len+1); memcpy(buf, WATCHDOG_NODEDIR, WATCHDOG_NODEDIR_LEN); len += WATCHDOG_NODEDIR_LEN; } if (strstr(buf, "/../") || strncmp(WATCHDOG_NODEDIR, buf, WATCHDOG_NODEDIR_LEN)) { continue; } else { /* just memorize to avoid statting the target - SELinux */ struct link_list_item *lli = calloc(1, sizeof(struct link_list_item)); lli->dev_node = strdup(buf); lli->link_name = strdup(entry_name); lli->next = link_list; link_list = lli; } } } rewinddir(dp); while ((entry = readdir(dp))) { if (entry->d_type == DT_CHR) { struct stat statbuf; snprintf(entry_name, sizeof(entry_name), WATCHDOG_NODEDIR "%s", entry->d_name); if(!stat(entry_name, &statbuf) && S_ISCHR(statbuf.st_mode)) { int i; for (i=0; idev = watchdogs[i]; wdg->dev_node = strdup(entry_name); wdg->next = watchdog_list; watchdog_list = wdg; watchdog_list_items++; if (wdfd >= 0) { struct watchdog_info ident; ident.identity[0] = '\0'; ioctl(wdfd, WDIOC_GETSUPPORT, &ident); watchdog_close_fd(wdfd, entry_name, true); if (ident.identity[0]) { wdg->dev_ident = strdup((char *) ident.identity); } } snprintf(entry_name, sizeof(entry_name), SYS_CHAR_DEV_DIR "/%d:%d/device/driver", major(watchdogs[i]), minor(watchdogs[i])); len = readlink(entry_name, buf, sizeof(buf) - 1); if (len > 0) { buf[len] = '\0'; wdg->dev_driver = strdup(basename(buf)); } else if ((wdg->dev_ident) && (strcmp(wdg->dev_ident, "Software Watchdog") == 0)) { wdg->dev_driver = strdup("softdog"); } /* create dupes if we have memorized links * to this node */ for (tmp_list = link_list; tmp_list; tmp_list = tmp_list->next) { if (!strcmp(tmp_list->dev_node, wdg->dev_node)) { struct watchdog_list_item *dupe_wdg = calloc(1, sizeof(struct watchdog_list_item)); /* as long as we never purge watchdog_list * there is no need to dupe strings */ 
*dupe_wdg = *wdg; dupe_wdg->dev_node = strdup(tmp_list->link_name); dupe_wdg->next = watchdog_list; watchdog_list = dupe_wdg; watchdog_list_items++; } /* for performance reasons we could remove * the link_list entry */ } break; } } } } } closedir(dp); } /* cleanup link list */ while (link_list) { struct link_list_item *tmp_list = link_list; link_list = link_list->next; free(tmp_list->dev_node); free(tmp_list->link_name); free(tmp_list); } } int watchdog_info(void) { struct watchdog_list_item *wdg; int wdg_cnt = 0; watchdog_populate_list(); printf("\nDiscovered %d watchdog devices:\n", watchdog_list_items); for (wdg = watchdog_list; wdg != NULL; wdg = wdg->next) { wdg_cnt++; printf("\n[%d] %s\nIdentity: %s\nDriver: %s\n", wdg_cnt, wdg->dev_node, wdg->dev_ident?wdg->dev_ident:"Error: Check if hogged by e.g. sbd-daemon!", wdg->dev_driver?wdg->dev_driver:""); if ((wdg->dev_driver) && (strcmp(wdg->dev_driver, "softdog") == 0)) { printf("CAUTION: Not recommended for use with sbd.\n"); } } return 0; } int watchdog_test(void) { int i; if ((watchdog_set_timeout == 0) || !watchdog_use) { printf("\nWatchdog is disabled - aborting test!!!\n"); return 0; } if (watchdogdev_is_default) { watchdog_populate_list(); if (watchdog_list_items > 1) { printf("\nError: Multiple watchdog devices discovered.\n" " Use -w or SBD_WATCHDOG_DEV to specify\n" " which device to reset the system with\n"); watchdog_info(); return -1; } } if ((isatty(fileno(stdin)))) { char buffer[16]; printf("\nWARNING: This operation is expected to force-reboot this system\n" " without following any shutdown procedures.\n\n" "Proceed? 
[NO/Proceed] "); if ((fgets(buffer, 16, stdin) == NULL) || strcmp(buffer, "Proceed\n")) { printf("\nAborting watchdog test!!!\n"); return 0; } printf("\n"); } printf("Initializing %s with a reset countdown of %d seconds ...\n", watchdogdev, (int) timeout_watchdog); if ((watchdog_init() < 0) || (watchdog_init_interval() < 0)) { printf("Failed to initialize watchdog!!!\n"); return -1; } printf("\n"); printf("NOTICE: The watchdog device is expected to reset the system\n" " in %d seconds. If system remains active beyond that time,\n" " watchdog may not be functional.\n\n", (int) timeout_watchdog); for (i=timeout_watchdog; i>1; i--) { printf("Reset countdown ... %d seconds\n", i); sleep(1); } for (i=2; i>0; i--) { printf("System expected to reset any moment ...\n"); sleep(1); } for (i=5; i>0; i--) { printf("System should have reset ...\n"); sleep(1); } printf("Error: The watchdog device has failed to reboot the system,\n" " and it may not be suitable for usage with sbd.\n"); /* test should trigger a reboot thus returning is actually bad */ return -1; } /* This duplicates some code from linux/ioprio.h since these are not included * even in linux-kernel-headers. Sucks. 
See also * /usr/src/linux/Documentation/block/ioprio.txt and ioprio_set(2) */ extern int sys_ioprio_set(int, int, int); int ioprio_set(int which, int who, int ioprio); inline int ioprio_set(int which, int who, int ioprio) { return syscall(__NR_ioprio_set, which, who, ioprio); } enum { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE, }; enum { IOPRIO_WHO_PROCESS = 1, IOPRIO_WHO_PGRP, IOPRIO_WHO_USER, }; #define IOPRIO_BITS (16) #define IOPRIO_CLASS_SHIFT (13) #define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) #define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT) #define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK) #define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) static void sbd_stack_hogger(unsigned char * inbuf, int kbytes) { unsigned char buf[1024]; if(kbytes <= 0) { return; } if (inbuf == NULL) { memset(buf, HOG_CHAR, sizeof(buf)); } else { memcpy(buf, inbuf, sizeof(buf)); } if (kbytes > 0) { sbd_stack_hogger(buf, kbytes-1); } return; } static void sbd_malloc_hogger(int kbytes) { int j; void**chunks; int chunksize = 1024; if(kbytes <= 0) { return; } /* * We could call mallopt(M_MMAP_MAX, 0) to disable it completely, * but we've already called mlockall() * * We could also call mallopt(M_TRIM_THRESHOLD, -1) to prevent malloc * from giving memory back to the system, but we've already called * mlockall(MCL_FUTURE), so there's no need. */ chunks = malloc(kbytes * sizeof(void *)); if (chunks == NULL) { cl_log(LOG_WARNING, "Could not preallocate chunk array"); return; } for (j=0; j < kbytes; ++j) { chunks[j] = malloc(chunksize); if (chunks[j] == NULL) { cl_log(LOG_WARNING, "Could not preallocate block %d", j); } else { memset(chunks[j], 0, chunksize); } } for (j=0; j < kbytes; ++j) { free(chunks[j]); } free(chunks); } static void sbd_memlock(int stackgrowK, int heapgrowK) { #ifdef _POSIX_MEMLOCK /* * We could call setrlimit(RLIMIT_MEMLOCK,...) 
with a large * number, but the mcp runs as root and mlock(2) says: * * Since Linux 2.6.9, no limits are placed on the amount of memory * that a privileged process may lock, and this limit instead * governs the amount of memory that an unprivileged process may * lock. */ if (mlockall(MCL_CURRENT|MCL_FUTURE) >= 0) { cl_log(LOG_INFO, "Locked ourselves in memory"); /* Now allocate some extra pages (MCL_FUTURE will ensure they stay around) */ sbd_malloc_hogger(heapgrowK); sbd_stack_hogger(NULL, stackgrowK); } else { cl_perror("Unable to lock ourselves into memory"); } #else cl_log(LOG_ERR, "Unable to lock ourselves into memory"); #endif } static int get_realtime_budget(void) { FILE *f; char fname[PATH_MAX]; int res = -1, lnum = 0, num; char *cgroup = NULL, *namespecs = NULL; snprintf(fname, PATH_MAX, "/proc/%jd/cgroup", (intmax_t)getpid()); f = fopen(fname, "rt"); if (f == NULL) { cl_log(LOG_WARNING, "Can't open cgroup file for pid=%jd", (intmax_t)getpid()); goto exit_res; } while( (num = fscanf(f, "%d:%m[^:]:%m[^\n]\n", &lnum, &namespecs, &cgroup)) !=EOF ) { if (namespecs && strstr(namespecs, "cpuacct")) { free(namespecs); break; } if (cgroup) { free(cgroup); cgroup = NULL; } if (namespecs) { free(namespecs); namespecs = NULL; } /* not to get stuck if format changes */ if ((num < 3) && ((fscanf(f, "%*[^\n]") == EOF) || (fscanf(f, "\n") == EOF))) { break; } } fclose(f); if (cgroup == NULL) { cl_log(LOG_WARNING, "Failed getting cgroup for pid=%jd", (intmax_t)getpid()); goto exit_res; } snprintf(fname, PATH_MAX, "/sys/fs/cgroup/cpu%s/cpu.rt_runtime_us", cgroup); f = fopen(fname, "rt"); if (f == NULL) { cl_log(LOG_WARNING, "cpu.rt_runtime_us existed for root-slice but " "doesn't for '%s'", cgroup); goto exit_res; } if (fscanf(f, "%d", &res) != 1) { cl_log(LOG_WARNING, "failed reading rt-budget from %s", fname); } else { cl_log(LOG_INFO, "slice='%s' has rt-budget=%d", cgroup, res); } fclose(f); exit_res: if (cgroup) { free(cgroup); } return res; } /* stolen from corosync */ 
static int sbd_move_to_root_cgroup(bool enforce_root_cgroup) { FILE *f; int res = -1; /* * /sys/fs/cgroup is hardcoded, because most of Linux distributions are now * using systemd and systemd uses hardcoded path of cgroup mount point. * * This feature is expected to be removed as soon as systemd gets support * for managing RT configuration. */ f = fopen("/sys/fs/cgroup/cpu/cpu.rt_runtime_us", "rt"); if (f == NULL) { cl_log(LOG_DEBUG, "cpu.rt_runtime_us doesn't exist -> " "system without cgroup or with disabled CONFIG_RT_GROUP_SCHED"); res = 0; goto exit_res; } fclose(f); if ((!enforce_root_cgroup) && (get_realtime_budget() > 0)) { cl_log(LOG_DEBUG, "looks as if we have rt-budget in the slice we are " "-> skip moving to root-slice"); res = 0; goto exit_res; } f = fopen("/sys/fs/cgroup/cpu/tasks", "w"); if (f == NULL) { cl_log(LOG_WARNING, "Can't open cgroups tasks file for writing"); goto exit_res; } if (fprintf(f, "%jd\n", (intmax_t)getpid()) <= 0) { cl_log(LOG_WARNING, "Can't write sbd pid into cgroups tasks file"); goto close_and_exit_res; } close_and_exit_res: if (fclose(f) != 0) { cl_log(LOG_WARNING, "Can't close cgroups tasks file"); goto exit_res; } exit_res: return (res); } void sbd_make_realtime(int priority, int stackgrowK, int heapgrowK) { if(priority < 0) { return; } do { #ifdef SCHED_RR if (move_to_root_cgroup) { sbd_move_to_root_cgroup(enforce_moving_to_root_cgroup); } { int pmin = sched_get_priority_min(SCHED_RR); int pmax = sched_get_priority_max(SCHED_RR); struct sched_param sp; int pcurrent; if (priority == 0) { priority = pmax; } else if (priority < pmin) { priority = pmin; } else if (priority > pmax) { priority = pmax; } if (sched_getparam(0, &sp) < 0) { cl_perror("Unable to get scheduler priority"); } else if ((pcurrent = sched_getscheduler(0)) < 0) { cl_perror("Unable to get scheduler policy"); } else if ((pcurrent == SCHED_RR) && (sp.sched_priority >= priority)) { cl_log(LOG_INFO, "Stay with priority (%d) for policy SCHED_RR", 
sp.sched_priority); break; } else { memset(&sp, 0, sizeof(sp)); sp.sched_priority = priority; if (sched_setscheduler(0, SCHED_RR, &sp) < 0) { cl_perror( "Unable to set scheduler policy to SCHED_RR priority %d", priority); } else { cl_log(LOG_INFO, "Scheduler policy is now SCHED_RR priority %d", priority); break; } } } #else cl_log(LOG_ERR, "System does not support updating the scheduler policy"); #endif #ifdef PRIO_PGRP if (setpriority(PRIO_PGRP, 0, INT_MIN) < 0) { cl_perror("Unable to raise the scheduler priority"); } else { cl_log(LOG_INFO, "Scheduler priority raised to the maximum"); } #else cl_perror("System does not support setting the scheduler priority"); #endif } while (0); sbd_memlock(heapgrowK, stackgrowK); } void maximize_priority(void) { if (skip_rt) { cl_log(LOG_INFO, "Not elevating to realtime (-R specified)."); return; } sbd_make_realtime(0, 256, 256); if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 1)) != 0) { cl_perror("ioprio_set() call failed."); } } void sysrq_init(void) { FILE* procf; int c; procf = fopen("/proc/sys/kernel/sysrq", "r"); if (!procf) { cl_perror("cannot open /proc/sys/kernel/sysrq for read."); return; } if (fscanf(procf, "%d", &c) != 1) { cl_perror("Parsing sysrq failed"); c = 0; } fclose(procf); if (c == 1) return; /* 8 for debugging dumps of processes, 128 for reboot/poweroff */ c |= 136; procf = fopen("/proc/sys/kernel/sysrq", "w"); if (!procf) { cl_perror("cannot open /proc/sys/kernel/sysrq for writing"); return; } fprintf(procf, "%d", c); fclose(procf); return; } void sysrq_trigger(char t) { FILE *procf; procf = fopen("/proc/sysrq-trigger", "a"); if (!procf) { cl_perror("Opening sysrq-trigger failed."); return; } cl_log(LOG_INFO, "sysrq-trigger: %c\n", t); fprintf(procf, "%c\n", t); fclose(procf); return; } static void do_exit(char kind, bool do_flush) { /* TODO: Turn debug_mode into a bit field? 
Delay + kdump for example */ const char *reason = NULL; if (kind == 'c') { cl_log(LOG_NOTICE, "Initiating kdump"); } else if (debug_mode == 1) { cl_log(LOG_WARNING, "Initiating kdump instead of panicking the node (debug mode)"); kind = 'c'; } if (debug_mode == 2) { cl_log(LOG_WARNING, "Shutting down SBD instead of panicking the node (debug mode)"); watchdog_close(true); exit(0); } if (debug_mode == 3) { /* Give the system some time to flush logs to disk before rebooting. */ cl_log(LOG_WARNING, "Delaying node panic by 10s (debug mode)"); watchdog_close(true); sync(); sleep(10); } switch(kind) { case 'b': reason = "reboot"; break; case 'c': reason = "crashdump"; break; case 'o': reason = "off"; break; default: reason = "unknown"; break; } cl_log(LOG_EMERG, "Rebooting system: %s", reason); if (do_flush) { sync(); } if (kind == 'c') { if (timeout_watchdog_crashdump) { if (timeout_watchdog != timeout_watchdog_crashdump) { timeout_watchdog = timeout_watchdog_crashdump; watchdog_init_interval(); } watchdog_close(false); } else { watchdog_close(true); } sysrq_trigger(kind); } else { watchdog_close(false); sysrq_trigger(kind); if (reboot((kind == 'o')?RB_POWER_OFF:RB_AUTOBOOT) < 0) { cl_perror("%s failed", (kind == 'o')?"Poweroff":"Reboot"); } } exit(1); } void do_crashdump(void) { do_exit('c', true); } void do_reset(void) { do_exit('b', true); } void do_off(void) { do_exit('o', true); } void do_timeout_action(void) { do_exit(timeout_sysrq_char, do_flush); } /* * Change directory to the directory our core file needs to go in * Call after you establish the userid you're running under. 
*/ int sbd_cdtocoredir(void) { int rc; static const char *dir = NULL; if (dir == NULL) { dir = CRM_CORE_DIR; } if ((rc=chdir(dir)) < 0) { int errsave = errno; cl_perror("Cannot chdir to [%s]", dir); errno = errsave; } return rc; } pid_t make_daemon(void) { pid_t pid; const char * devnull = "/dev/null"; pid = fork(); if (pid < 0) { cl_log(LOG_ERR, "%s: could not start daemon\n", cmdname); cl_perror("fork"); exit(1); }else if (pid > 0) { return pid; } qb_log_ctl(QB_LOG_STDERR, QB_LOG_CONF_ENABLED, QB_FALSE); /* This is the child; ensure privileges have not been lost. */ maximize_priority(); sysrq_init(); umask(022); close(0); (void)open(devnull, O_RDONLY); close(1); (void)open(devnull, O_WRONLY); close(2); (void)open(devnull, O_WRONLY); sbd_cdtocoredir(); return 0; } void sbd_get_uname(void) { struct utsname uname_buf; int i; if (uname(&uname_buf) < 0) { cl_perror("uname() failed?"); exit(1); } local_uname = strdup(uname_buf.nodename); for (i = 0; i < strlen(local_uname); i++) local_uname[i] = tolower(local_uname[i]); } #define FMT_MAX 256 void sbd_set_format_string(int method, const char *daemon) { int offset = 0; char fmt[FMT_MAX]; struct utsname res; switch(method) { case QB_LOG_STDERR: break; case QB_LOG_SYSLOG: if(daemon && strcmp(daemon, "sbd") != 0) { offset += snprintf(fmt + offset, FMT_MAX - offset, "%10s: ", daemon); } break; default: /* When logging to a file */ if (uname(&res) == 0) { offset += snprintf(fmt + offset, FMT_MAX - offset, "%%t [%d] %s %10s: ", getpid(), res.nodename, daemon); } else { offset += snprintf(fmt + offset, FMT_MAX - offset, "%%t [%d] %10s: ", getpid(), daemon); } } if (debug && method >= QB_LOG_STDERR) { offset += snprintf(fmt + offset, FMT_MAX - offset, "(%%-12f:%%5l %%g) %%-7p: %%n: "); } else { offset += snprintf(fmt + offset, FMT_MAX - offset, "%%g %%-7p: %%n: "); } if (method == QB_LOG_SYSLOG) { offset += snprintf(fmt + offset, FMT_MAX - offset, "%%b"); } else { offset += snprintf(fmt + offset, FMT_MAX - offset, "\t%%b"); } 
if(offset > 0) { qb_log_format_set(method, fmt); } } void notify_parent(void) { pid_t ppid; union sigval signal_value; memset(&signal_value, 0, sizeof(signal_value)); ppid = getppid(); if (ppid == 1) { /* Our parent died unexpectedly. Triggering * self-fence. */ cl_log(LOG_WARNING, "Our parent is dead."); do_timeout_action(); } switch (servant_health) { case pcmk_health_pending: case pcmk_health_shutdown: case pcmk_health_transient: DBGLOG(LOG_DEBUG, "Not notifying parent: state transient (%d)", servant_health); break; case pcmk_health_unknown: case pcmk_health_unclean: case pcmk_health_noquorum: DBGLOG(LOG_WARNING, "Notifying parent: UNHEALTHY (%d)", servant_health); sigqueue(ppid, SIG_PCMK_UNHEALTHY, signal_value); break; case pcmk_health_online: DBGLOG(LOG_DEBUG, "Notifying parent: healthy"); sigqueue(ppid, SIG_LIVENESS, signal_value); break; default: DBGLOG(LOG_WARNING, "Notifying parent: UNHEALTHY %d", servant_health); sigqueue(ppid, SIG_PCMK_UNHEALTHY, signal_value); break; } } void set_servant_health(enum pcmk_health state, int level, char const *format, ...) 
{ if (servant_health != state) { va_list ap; int len = 0; char *string = NULL; servant_health = state; va_start(ap, format); len = vasprintf (&string, format, ap); if(len > 0) { cl_log(level, "%s", string); } va_end(ap); free(string); } } bool sbd_is_disk(struct servants_list_item *servant) { if ((servant != NULL) && (servant->devname != NULL) && (servant->devname[0] == '/')) { return true; } return false; } bool sbd_is_cluster(struct servants_list_item *servant) { if ((servant != NULL) && (servant->devname != NULL) && (strcmp("cluster", servant->devname) == 0)) { return true; } return false; } bool sbd_is_pcmk(struct servants_list_item *servant) { if ((servant != NULL) && (servant->devname != NULL) && (strcmp("pcmk", servant->devname) == 0)) { return true; } return false; } sbd-1.5.1/src/sbd-inquisitor.c000066400000000000000000001115751414464774600162300ustar00rootroot00000000000000/* * Copyright (C) 2013 Lars Marowsky-Bree * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
*/ #include #include "sbd.h" #define LOCKSTRLEN 11 static struct servants_list_item *servants_leader = NULL; int disk_priority = 1; int check_pcmk = 1; int check_cluster = 1; int disk_count = 0; int servant_count = 0; int servant_restart_interval = 5; int servant_restart_count = 1; int start_mode = 0; char* pidfile = NULL; bool do_flush = true; char timeout_sysrq_char = 'b'; bool move_to_root_cgroup = true; bool enforce_moving_to_root_cgroup = false; bool sync_resource_startup = false; int parse_device_line(const char *line); static int sanitize_numeric_option_value(const char *value) { char *end = NULL; long int result = -1; if (value == NULL) { return -1; } errno = 0; result = strtol(value, &end, 10); if (result <= INT_MIN || result >= INT_MAX || errno != 0) { result = -1; } else if (*end != '\0') { result = -1; } return (int)result; } static const char * sanitize_option_value(const char *value) { size_t max = 0; size_t lpc = 0; if (value == NULL) { return NULL; } max = strlen(value); for (lpc = 0; lpc < max; lpc++) { if (!isspace(value[lpc])) { break; } } return (strlen(value + lpc) > 0 ? 
(value + lpc) : NULL); } static const char * get_env_option(const char *option) { const char *value = getenv(option); return sanitize_option_value(value); } static int recruit_servant(const char *devname, pid_t pid) { struct servants_list_item *s = servants_leader; struct servants_list_item *newbie; if (lookup_servant_by_dev(devname)) { cl_log(LOG_DEBUG, "Servant %s already exists", devname); return 0; } newbie = malloc(sizeof(*newbie)); if (newbie) { memset(newbie, 0, sizeof(*newbie)); newbie->devname = strdup(devname); newbie->pid = pid; newbie->first_start = 1; } if (!newbie || !newbie->devname) { fprintf(stderr, "heap allocation failed in recruit_servant.\n"); exit(1); } /* some sanity-check on our newbie */ if (sbd_is_disk(newbie)) { cl_log(LOG_INFO, "Monitoring %s", devname); disk_count++; } else if (sbd_is_pcmk(newbie) || sbd_is_cluster(newbie)) { /* alive just after pcmk and cluster servants have shown up */ newbie->outdated = 1; } else { /* toss our newbie */ cl_log(LOG_ERR, "Refusing to recruit unrecognized servant %s", devname); free((void *) newbie->devname); free(newbie); return -1; } if (!s) { servants_leader = newbie; } else { while (s->next) s = s->next; s->next = newbie; } servant_count++; return 0; } int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp) { pid_t pid = 0; int rc = 0; pid = fork(); if (pid == 0) { /* child */ maximize_priority(); sbd_set_format_string(QB_LOG_SYSLOG, devname); rc = (*functionp)(devname, mode, argp); if (rc == -1) exit(1); else exit(0); } else if (pid != -1) { /* parent */ return pid; } else { cl_log(LOG_ERR,"Failed to fork servant"); exit(1); } } struct servants_list_item *lookup_servant_by_dev(const char *devname) { struct servants_list_item *s; for (s = servants_leader; s; s = s->next) { if (strcasecmp(s->devname, devname) == 0) break; } return s; } struct servants_list_item *lookup_servant_by_pid(pid_t pid) { struct servants_list_item *s; for (s = servants_leader; s; s = 
s->next) { if (s->pid == pid) break; } return s; } int check_all_dead(void) { struct servants_list_item *s; int r = 0; union sigval svalue; for (s = servants_leader; s; s = s->next) { if (s->pid != 0) { r = sigqueue(s->pid, 0, svalue); if (r == -1 && errno == ESRCH) continue; return 0; } } return 1; } void servant_start(struct servants_list_item *s) { int r = 0; union sigval svalue; if (s->pid != 0) { r = sigqueue(s->pid, 0, svalue); if ((r != -1 || errno != ESRCH)) return; } s->restarts++; if (sbd_is_disk(s)) { #if SUPPORT_SHARED_DISK DBGLOG(LOG_INFO, "Starting servant for device %s", s->devname); s->pid = assign_servant(s->devname, servant_md, start_mode, s); #else cl_log(LOG_ERR, "Shared disk functionality not supported"); return; #endif } else if(sbd_is_pcmk(s)) { DBGLOG(LOG_INFO, "Starting Pacemaker servant"); s->pid = assign_servant(s->devname, servant_pcmk, start_mode, NULL); } else if(sbd_is_cluster(s)) { DBGLOG(LOG_INFO, "Starting Cluster servant"); s->pid = assign_servant(s->devname, servant_cluster, start_mode, NULL); } else { cl_log(LOG_ERR, "Unrecognized servant: %s", s->devname); } clock_gettime(CLOCK_MONOTONIC, &s->t_started); return; } void servants_start(void) { struct servants_list_item *s; for (s = servants_leader; s; s = s->next) { s->restarts = 0; servant_start(s); } } void servants_kill(void) { struct servants_list_item *s; union sigval svalue; for (s = servants_leader; s; s = s->next) { if (s->pid != 0) sigqueue(s->pid, SIGKILL, svalue); } } static inline void cleanup_servant_by_pid(pid_t pid) { struct servants_list_item* s; s = lookup_servant_by_pid(pid); if (s) { cl_log(LOG_WARNING, "Servant for %s (pid: %i) has terminated", s->devname, s->pid); s->pid = 0; } else { /* This most likely is a stray signal from somewhere, or * a SIGCHLD for a process that has previously * explicitly disconnected. 
*/ DBGLOG(LOG_INFO, "cleanup_servant: Nothing known about pid %i", pid); } } int inquisitor_decouple(void) { pid_t ppid = getppid(); union sigval signal_value; /* During start-up, we only arm the watchdog once we've got * quorum at least once. */ if (watchdog_use) { if (watchdog_init() < 0) { return -1; } } if (ppid > 1) { sigqueue(ppid, SIG_LIVENESS, signal_value); } return 0; } static int sbd_lock_running(long pid) { int rc = 0; long mypid; int running = 0; char proc_path[PATH_MAX], exe_path[PATH_MAX], myexe_path[PATH_MAX]; /* check if pid is running */ if (kill(pid, 0) < 0 && errno == ESRCH) { goto bail; } #ifndef HAVE_PROC_PID return 1; #endif /* check to make sure pid hasn't been reused by another process */ snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", pid); rc = readlink(proc_path, exe_path, PATH_MAX-1); if(rc < 0) { cl_perror("Could not read from %s", proc_path); goto bail; } exe_path[rc] = 0; mypid = (unsigned long) getpid(); snprintf(proc_path, sizeof(proc_path), "/proc/%lu/exe", mypid); rc = readlink(proc_path, myexe_path, PATH_MAX-1); if(rc < 0) { cl_perror("Could not read from %s", proc_path); goto bail; } myexe_path[rc] = 0; if(strcmp(exe_path, myexe_path) == 0) { running = 1; } bail: return running; } static int sbd_lock_pidfile(const char *filename) { char lf_name[256], tf_name[256], buf[LOCKSTRLEN+1]; int fd; long pid, mypid; int rc; struct stat sbuf; if (filename == NULL) { errno = EFAULT; return -1; } mypid = (unsigned long) getpid(); snprintf(lf_name, sizeof(lf_name), "%s",filename); snprintf(tf_name, sizeof(tf_name), "%s.%lu", filename, mypid); if ((fd = open(lf_name, O_RDONLY)) >= 0) { if (fstat(fd, &sbuf) >= 0 && sbuf.st_size < LOCKSTRLEN) { sleep(1); /* if someone was about to create one, * give'm a sec to do so * Though if they follow our protocol, * this won't happen. They should really * put the pid in, then link, not the * other way around. 
*/ } if (read(fd, buf, sizeof(buf)) < 1) { /* lockfile empty -> rm it and go on */; } else { if (sscanf(buf, "%ld", &pid) < 1) { /* lockfile screwed up -> rm it and go on */ } else { if (pid > 1 && (getpid() != pid) && sbd_lock_running(pid)) { /* is locked by existing process * -> give up */ close(fd); return -1; } else { /* stale lockfile -> rm it and go on */ } } } unlink(lf_name); close(fd); } if ((fd = open(tf_name, O_CREAT | O_WRONLY | O_EXCL, 0644)) < 0) { /* Hmmh, why did we fail? Anyway, nothing we can do about it */ return -3; } /* Slight overkill with the %*d format ;-) */ snprintf(buf, sizeof(buf), "%*lu\n", LOCKSTRLEN-1, mypid); if (write(fd, buf, LOCKSTRLEN) != LOCKSTRLEN) { /* Again, nothing we can do about this */ rc = -3; close(fd); goto out; } close(fd); switch (link(tf_name, lf_name)) { case 0: if (stat(tf_name, &sbuf) < 0) { /* something weird happened */ rc = -3; break; } if (sbuf.st_nlink < 2) { /* somehow, it didn't get through - NFS trouble? */ rc = -2; break; } rc = 0; break; case EEXIST: rc = -1; break; default: rc = -3; } out: unlink(tf_name); return rc; } /* * Unlock a file (remove its lockfile) * do we need to check, if its (still) ours? 
No, IMHO, if someone else * locked our line, it's his fault -tho * returns 0 on success * <0 if some failure occured */ static int sbd_unlock_pidfile(const char *filename) { char lf_name[256]; if (filename == NULL) { errno = EFAULT; return -1; } snprintf(lf_name, sizeof(lf_name), "%s", filename); return unlink(lf_name); } int cluster_alive(bool all) { int alive = 1; struct servants_list_item* s; if(servant_count == disk_count) { return 0; } for (s = servants_leader; s; s = s->next) { if (sbd_is_cluster(s) || sbd_is_pcmk(s)) { if(s->outdated) { alive = 0; } else if(all == false) { return 1; } } } return alive; } int quorum_read(int good_servants) { if (disk_count > 2) return (good_servants > disk_count/2); else return (good_servants > 0); } void inquisitor_child(void) { int sig, pid; sigset_t procmask; siginfo_t sinfo; int status; struct timespec timeout; int exiting = 0; int decoupled = 0; int cluster_appeared = 0; int pcmk_override = 0; time_t latency; struct timespec t_last_tickle, t_now; struct servants_list_item* s; if (debug_mode) { cl_log(LOG_ERR, "DEBUG MODE %d IS ACTIVE - DO NOT RUN IN PRODUCTION!", debug_mode); } set_proc_title("sbd: inquisitor"); if (pidfile) { if (sbd_lock_pidfile(pidfile) < 0) { exit(1); } } sigemptyset(&procmask); sigaddset(&procmask, SIGCHLD); sigaddset(&procmask, SIGTERM); sigaddset(&procmask, SIG_LIVENESS); sigaddset(&procmask, SIG_EXITREQ); sigaddset(&procmask, SIG_TEST); sigaddset(&procmask, SIG_PCMK_UNHEALTHY); sigaddset(&procmask, SIG_RESTART); sigaddset(&procmask, SIGUSR1); sigaddset(&procmask, SIGUSR2); sigprocmask(SIG_BLOCK, &procmask, NULL); servants_start(); timeout.tv_sec = timeout_loop; timeout.tv_nsec = 0; clock_gettime(CLOCK_MONOTONIC, &t_last_tickle); while (1) { bool tickle = 0; bool can_detach = 0; int good_servants = 0; sig = sigtimedwait(&procmask, &sinfo, &timeout); clock_gettime(CLOCK_MONOTONIC, &t_now); if (sig == SIG_EXITREQ || sig == SIGTERM) { servants_kill(); watchdog_close(true); exiting = 1; } else if (sig 
== SIGCHLD) { while ((pid = waitpid(-1, &status, WNOHANG))) { if (pid == -1 && errno == ECHILD) { break; } else { s = lookup_servant_by_pid(pid); if (sbd_is_disk(s)) { if (WIFEXITED(status)) { switch(WEXITSTATUS(status)) { case EXIT_MD_SERVANT_IO_FAIL: DBGLOG(LOG_INFO, "Servant for %s requests to be disowned", s->devname); break; case EXIT_MD_SERVANT_REQUEST_RESET: cl_log(LOG_WARNING, "%s requested a reset", s->devname); do_reset(); break; case EXIT_MD_SERVANT_REQUEST_SHUTOFF: cl_log(LOG_WARNING, "%s requested a shutoff", s->devname); do_off(); break; case EXIT_MD_SERVANT_REQUEST_CRASHDUMP: cl_log(LOG_WARNING, "%s requested a crashdump", s->devname); do_crashdump(); break; default: break; } } } else if (sbd_is_pcmk(s)) { if (WIFEXITED(status)) { switch(WEXITSTATUS(status)) { case EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN: DBGLOG(LOG_INFO, "PCMK-Servant has exited gracefully"); /* revert to state prior to pacemaker-detection */ s->restarts = 0; s->restart_blocked = 0; cluster_appeared = 0; s->outdated = 1; s->t_last.tv_sec = 0; break; default: break; } } } cleanup_servant_by_pid(pid); } } } else if (sig == SIG_PCMK_UNHEALTHY) { s = lookup_servant_by_pid(sinfo.si_pid); if (sbd_is_cluster(s) || sbd_is_pcmk(s)) { if (s->outdated == 0) { cl_log(LOG_WARNING, "%s health check: UNHEALTHY", s->devname); } s->t_last.tv_sec = 1; } else { cl_log(LOG_WARNING, "Ignoring SIG_PCMK_UNHEALTHY from unknown source"); } } else if (sig == SIG_LIVENESS) { s = lookup_servant_by_pid(sinfo.si_pid); if (s) { s->first_start = 0; clock_gettime(CLOCK_MONOTONIC, &s->t_last); } } else if (sig == SIG_TEST) { } else if (sig == SIGUSR1) { if (exiting) continue; servants_start(); } if (exiting) { if (check_all_dead()) { if (pidfile) { sbd_unlock_pidfile(pidfile); } exit(0); } else continue; } good_servants = 0; for (s = servants_leader; s; s = s->next) { int age = t_now.tv_sec - s->t_last.tv_sec; if (!s->t_last.tv_sec) continue; if (age < (int)(timeout_io+timeout_loop)) { if (sbd_is_disk(s)) { 
good_servants++; } if (s->outdated) { cl_log(LOG_NOTICE, "Servant %s is healthy (age: %d)", s->devname, age); } s->outdated = 0; } else if (!s->outdated) { if (!s->restart_blocked) { cl_log(LOG_WARNING, "Servant %s is outdated (age: %d)", s->devname, age); } s->outdated = 1; } } if(disk_count == 0) { /* NO disks, everything is up to the cluster */ if(cluster_alive(true)) { /* We LIVE! */ if(cluster_appeared == false) { cl_log(LOG_INFO, "Active cluster detected"); } tickle = 1; can_detach = 1; cluster_appeared = 1; } else if(cluster_alive(false)) { if(!decoupled) { /* On the way up, detach and arm the watchdog */ cl_log(LOG_INFO, "Partial cluster detected, detaching"); } can_detach = 1; tickle = !cluster_appeared; } else if(!decoupled) { /* Stay alive until the cluster comes up */ tickle = !cluster_appeared; } } else if(disk_priority == 1 || servant_count == disk_count) { if (quorum_read(good_servants)) { /* There are disks and we're connected to the majority of them */ tickle = 1; can_detach = 1; pcmk_override = 0; } else if (servant_count > disk_count && cluster_alive(true)) { tickle = 1; if(!pcmk_override) { cl_log(LOG_WARNING, "Majority of devices lost - surviving on pacemaker"); pcmk_override = 1; /* Only log this message once */ } } } else if(cluster_alive(true) && quorum_read(good_servants)) { /* Both disk and cluster servants are healthy */ tickle = 1; can_detach = 1; cluster_appeared = 1; } else if(quorum_read(good_servants)) { /* The cluster takes priority but only once * connected for the first time. * * Until then, we tickle based on disk quorum. 
*/ can_detach = 1; tickle = !cluster_appeared; } /* cl_log(LOG_DEBUG, "Tickle: q=%d, g=%d, p=%d, s=%d", */ /* quorum_read(good_servants), good_servants, tickle, disk_count); */ if(tickle) { watchdog_tickle(); clock_gettime(CLOCK_MONOTONIC, &t_last_tickle); } if (!decoupled && can_detach) { /* We only do this at the point either the disk or * cluster servants become healthy */ cl_log(LOG_DEBUG, "Decoupling"); if (inquisitor_decouple() < 0) { servants_kill(); exiting = 1; continue; } else { decoupled = 1; } } /* Note that this can actually be negative, since we set * last_tickle after we set now. */ latency = t_now.tv_sec - t_last_tickle.tv_sec; if (timeout_watchdog && (latency > (int)timeout_watchdog)) { if (!decoupled) { /* We're still being watched by our * parent. We don't fence, but exit. */ cl_log(LOG_ERR, "SBD: Not enough votes to proceed. Aborting start-up."); servants_kill(); exiting = 1; continue; } if (debug_mode < 2) { /* At level 2 or above, we do nothing, but expect * things to eventually return to * normal. 
*/ do_timeout_action(); } else { cl_log(LOG_ERR, "SBD: DEBUG MODE: Would have fenced due to timeout!"); } } if (timeout_watchdog_warn && (latency > (int)timeout_watchdog_warn)) { cl_log(LOG_WARNING, "Latency: No liveness for %ds exceeds watchdog warning timeout of %ds (healthy servants: %d)", (int)latency, (int)timeout_watchdog_warn, good_servants); if (debug_mode && watchdog_use) { /* In debug mode, trigger a reset before the watchdog can panic the machine */ do_timeout_action(); } } for (s = servants_leader; s; s = s->next) { int age = t_now.tv_sec - s->t_started.tv_sec; if (age > servant_restart_interval) { s->restarts = 0; s->restart_blocked = 0; } if (servant_restart_count && (s->restarts >= servant_restart_count) && !s->restart_blocked) { if (servant_restart_count > 1) { cl_log(LOG_WARNING, "Max retry count (%d) reached: not restarting servant for %s", (int)servant_restart_count, s->devname); } s->restart_blocked = 1; } if (!s->restart_blocked) { servant_start(s); } } } /* not reached */ exit(0); } int inquisitor(void) { int sig, pid, inquisitor_pid; int status; sigset_t procmask; siginfo_t sinfo; /* Where's the best place for sysrq init ?*/ sysrq_init(); sigemptyset(&procmask); sigaddset(&procmask, SIGCHLD); sigaddset(&procmask, SIG_LIVENESS); sigprocmask(SIG_BLOCK, &procmask, NULL); inquisitor_pid = make_daemon(); if (inquisitor_pid == 0) { inquisitor_child(); } /* We're the parent. Wait for a happy signal from our child * before we proceed - we either get "SIG_LIVENESS" when the * inquisitor has completed the first successful round, or * ECHLD when it exits with an error. */ while (1) { sig = sigwaitinfo(&procmask, &sinfo); if (sig == SIGCHLD) { while ((pid = waitpid(-1, &status, WNOHANG))) { if (pid == -1 && errno == ECHILD) { break; } /* We got here because the inquisitor * did not succeed. */ return -1; } } else if (sig == SIG_LIVENESS) { /* Inquisitor started up properly. 
*/ return 0; } else { fprintf(stderr, "Nobody expected the spanish inquisition!\n"); continue; } } /* not reached */ return -1; } int parse_device_line(const char *line) { size_t lpc = 0; size_t last = 0; size_t max = 0; int found = 0; bool skip_space = true; int space_run = 0; if (!line) { return 0; } max = strlen(line); cl_log(LOG_DEBUG, "Processing %d bytes: [%s]", (int) max, line); for (lpc = 0; lpc <= max; lpc++) { if (isspace(line[lpc])) { if (skip_space) { last = lpc + 1; } else { space_run++; } continue; } skip_space = false; if (line[lpc] == ';' || line[lpc] == 0) { int rc = 0; char *entry = calloc(1, 1 + lpc - last); if (entry) { rc = sscanf(line + last, "%[^;]", entry); } else { fprintf(stderr, "Heap allocation failed parsing device-line.\n"); exit(1); } if (rc != 1) { cl_log(LOG_WARNING, "Could not parse: '%s'", line + last); } else { entry[strlen(entry)-space_run] = '\0'; cl_log(LOG_DEBUG, "Adding '%s'", entry); if (recruit_servant(entry, 0) != 0) { free(entry); // sbd should refuse to start if any of the configured device names is invalid. 
return -1; } found++; } free(entry); skip_space = true; last = lpc + 1; } space_run = 0; } return found; } #define SBD_SOURCE_FILES "sbd-cluster.c,sbd-common.c,sbd-inquisitor.c,sbd-md.c,sbd-pacemaker.c,setproctitle.c" static void sbd_log_filter_ctl(const char *files, uint8_t priority) { if (files == NULL) { files = SBD_SOURCE_FILES; } qb_log_filter_ctl(QB_LOG_SYSLOG, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, files, priority); qb_log_filter_ctl(QB_LOG_STDERR, QB_LOG_FILTER_ADD, QB_LOG_FILTER_FILE, files, priority); } int arg_enabled(int arg_count) { return arg_count % 2; } int main(int argc, char **argv, char **envp) { int exit_status = 0; int c; int W_count = 0; int c_count = 0; int P_count = 0; int qb_facility; const char *value = NULL; bool delay_start = false; long delay = 0; char *timeout_action = NULL; if ((cmdname = strrchr(argv[0], '/')) == NULL) { cmdname = argv[0]; } else { ++cmdname; } watchdogdev = strdup("/dev/watchdog"); watchdogdev_is_default = true; qb_facility = qb_log_facility2int("daemon"); qb_log_init(cmdname, qb_facility, LOG_WARNING); sbd_set_format_string(QB_LOG_SYSLOG, "sbd"); qb_log_ctl(QB_LOG_SYSLOG, QB_LOG_CONF_ENABLED, QB_TRUE); qb_log_ctl(QB_LOG_STDERR, QB_LOG_CONF_ENABLED, QB_FALSE); sbd_log_filter_ctl(NULL, LOG_NOTICE); sbd_get_uname(); value = get_env_option("SBD_PACEMAKER"); if(value) { check_pcmk = crm_is_true(value); check_cluster = crm_is_true(value); } cl_log(LOG_INFO, "Enable pacemaker checks: %d (%s)", (int)check_pcmk, value?value:"default"); value = get_env_option("SBD_STARTMODE"); if(value == NULL) { } else if(strcmp(value, "clean") == 0) { start_mode = 1; } else if(strcmp(value, "always") == 0) { start_mode = 0; } cl_log(LOG_INFO, "Start mode set to: %d (%s)", (int)start_mode, value?value:"default"); value = get_env_option("SBD_WATCHDOG_DEV"); if(value) { free(watchdogdev); watchdogdev = strdup(value); watchdogdev_is_default = false; } /* SBD_WATCHDOG has been dropped from sbd.sysconfig example. 
* This is for backward compatibility. */ value = get_env_option("SBD_WATCHDOG"); if(value) { watchdog_use = crm_is_true(value); } value = get_env_option("SBD_WATCHDOG_TIMEOUT"); if(value) { timeout_watchdog = crm_get_msec(value) / 1000; } value = get_env_option("SBD_PIDFILE"); if(value) { pidfile = strdup(value); cl_log(LOG_INFO, "pidfile set to %s", pidfile); } value = get_env_option("SBD_DELAY_START"); if(value) { delay_start = crm_is_true(value); if (!delay_start) { delay = crm_get_msec(value) / 1000; if (delay > 0) { delay_start = true; } } } value = get_env_option("SBD_TIMEOUT_ACTION"); if(value) { timeout_action = strdup(value); } value = get_env_option("SBD_MOVE_TO_ROOT_CGROUP"); if(value) { move_to_root_cgroup = crm_is_true(value); if (move_to_root_cgroup) { enforce_moving_to_root_cgroup = true; } else { if (strcmp(value, "auto") == 0) { move_to_root_cgroup = true; } } } while ((c = getopt(argc, argv, "czC:DPRTWZhvw:d:n:p:1:2:3:4:5:t:I:F:S:s:r:")) != -1) { int sanitized_num_optarg = 0; /* Call it before checking optarg for NULL to make coverity happy */ const char *sanitized_optarg = sanitize_option_value(optarg); if (optarg && ((sanitized_optarg == NULL) || (strchr("SsC12345tIF", c) && (sanitized_num_optarg = sanitize_numeric_option_value(sanitized_optarg)) < 0))) { fprintf(stderr, "Invalid value \"%s\" for option -%c\n", optarg, c); exit_status = -2; goto out; } switch (c) { case 'D': break; case 'Z': debug_mode++; cl_log(LOG_INFO, "Debug mode now at level %d", (int)debug_mode); break; case 'R': skip_rt = 1; cl_log(LOG_INFO, "Realtime mode deactivated."); break; case 'S': start_mode = sanitized_num_optarg; cl_log(LOG_INFO, "Start mode set to: %d", (int)start_mode); break; case 's': timeout_startup = sanitized_num_optarg; cl_log(LOG_INFO, "Start timeout set to: %d", (int)timeout_startup); break; case 'v': debug++; if(debug == 1) { sbd_log_filter_ctl(NULL, LOG_INFO); cl_log(LOG_INFO, "Verbose mode enabled."); } else if(debug == 2) { sbd_log_filter_ctl(NULL, 
LOG_DEBUG); cl_log(LOG_INFO, "Debug mode enabled."); } else if(debug == 3) { /* Go nuts, turn on pacemaker's logging too */ sbd_log_filter_ctl("*", LOG_DEBUG); cl_log(LOG_INFO, "Debug library mode enabled."); } break; case 'T': watchdog_set_timeout = 0; cl_log(LOG_INFO, "Setting watchdog timeout disabled; using defaults."); break; case 'W': W_count++; break; case 'w': free(watchdogdev); watchdogdev = strdup(sanitized_optarg); watchdogdev_is_default = false; cl_log(LOG_NOTICE, "Using watchdog device '%s'", watchdogdev); break; case 'd': #if SUPPORT_SHARED_DISK if (recruit_servant(sanitized_optarg, 0) != 0) { fprintf(stderr, "Invalid device: %s\n", optarg); exit_status = -1; goto out; } #else fprintf(stderr, "Shared disk functionality not supported\n"); exit_status = -2; goto out; #endif break; case 'c': c_count++; break; case 'P': P_count++; break; case 'z': disk_priority = 0; break; case 'n': local_uname = strdup(sanitized_optarg); cl_log(LOG_INFO, "Overriding local hostname to %s", local_uname); break; case 'p': pidfile = strdup(sanitized_optarg); cl_log(LOG_INFO, "pidfile set to %s", pidfile); break; case 'C': timeout_watchdog_crashdump = sanitized_num_optarg; cl_log(LOG_INFO, "Setting crashdump watchdog timeout to %d", (int)timeout_watchdog_crashdump); break; case '1': timeout_watchdog = sanitized_num_optarg; break; case '2': timeout_allocate = sanitized_num_optarg; break; case '3': timeout_loop = sanitized_num_optarg; break; case '4': timeout_msgwait = sanitized_num_optarg; break; case '5': timeout_watchdog_warn = sanitized_num_optarg; do_calculate_timeout_watchdog_warn = false; cl_log(LOG_INFO, "Setting latency warning to %d", (int)timeout_watchdog_warn); break; case 't': servant_restart_interval = sanitized_num_optarg; cl_log(LOG_INFO, "Setting servant restart interval to %d", (int)servant_restart_interval); break; case 'I': timeout_io = sanitized_num_optarg; cl_log(LOG_INFO, "Setting IO timeout to %d", (int)timeout_io); break; case 'F': servant_restart_count 
= sanitized_num_optarg; cl_log(LOG_INFO, "Servant restart count set to %d", (int)servant_restart_count); break; case 'r': if (timeout_action) { free(timeout_action); } timeout_action = strdup(sanitized_optarg); break; case 'h': usage(); goto out; break; default: exit_status = -2; goto out; break; } } if (disk_count == 0) { /* if we already have disks from commandline then it is probably undesirable to add those from environment (general rule cmdline has precedence) */ value = get_env_option("SBD_DEVICE"); if ((value) && strlen(value)) { #if SUPPORT_SHARED_DISK int devices = parse_device_line(value); if(devices < 1) { fprintf(stderr, "Invalid device line: %s\n", value); exit_status = -1; goto out; } #else fprintf(stderr, "Shared disk functionality not supported\n"); exit_status = -2; goto out; #endif } } if (watchdogdev == NULL || strcmp(watchdogdev, "/dev/null") == 0) { watchdog_use = 0; } else if (W_count > 0) { watchdog_use = arg_enabled(W_count); } if (watchdog_use) { cl_log(LOG_INFO, "Watchdog enabled."); } else { cl_log(LOG_INFO, "Watchdog disabled."); } if (c_count > 0) { check_cluster = arg_enabled(c_count); } if (P_count > 0) { check_pcmk = arg_enabled(P_count); } if ((disk_count > 0) && (strlen(local_uname) > SECTOR_NAME_MAX)) { fprintf(stderr, "Node name mustn't be longer than %d chars.\n", SECTOR_NAME_MAX); fprintf(stderr, "If uname is longer define a name to be used by sbd.\n"); exit_status = -1; goto out; } if (disk_count > 3) { fprintf(stderr, "You can specify up to 3 devices via the -d option.\n"); exit_status = -1; goto out; } /* There must at least be one command following the options: */ if ((argc - optind) < 1) { fprintf(stderr, "Not enough arguments.\n"); exit_status = -2; goto out; } if (init_set_proc_title(argc, argv, envp) < 0) { fprintf(stderr, "Allocation of proc title failed.\n"); exit_status = -1; goto out; } if (timeout_action) { char *p[2]; int i; char c; int nrflags = sscanf(timeout_action, "%m[a-z],%m[a-z]%c", &p[0], &p[1], &c); bool 
parse_error = (nrflags < 1) || (nrflags > 2); for (i = 0; (i < nrflags) && (i < 2); i++) { if (!strcmp(p[i], "reboot")) { timeout_sysrq_char = 'b'; } else if (!strcmp(p[i], "crashdump")) { timeout_sysrq_char = 'c'; } else if (!strcmp(p[i], "off")) { timeout_sysrq_char = 'o'; } else if (!strcmp(p[i], "flush")) { do_flush = true; } else if (!strcmp(p[i], "noflush")) { do_flush = false; } else { parse_error = true; } free(p[i]); } if (parse_error) { fprintf(stderr, "Failed to parse timeout-action \"%s\".\n", timeout_action); exit_status = -1; goto out; } } if (strcmp(argv[optind], "watch") == 0) { value = get_env_option("SBD_SYNC_RESOURCE_STARTUP"); sync_resource_startup = crm_is_true(value?value:SBD_SYNC_RESOURCE_STARTUP_DEFAULT); #if !USE_PACEMAKERD_API if (sync_resource_startup) { fprintf(stderr, "Failed to sync resource-startup as " "SBD was built against pacemaker not supporting pacemakerd-API.\n"); exit_status = -1; goto out; } #else if (!sync_resource_startup) { cl_log(LOG_WARNING, "SBD built against pacemaker supporting " "pacemakerd-API. 
Should think about enabling " "SBD_SYNC_RESOURCE_STARTUP."); } #endif } #if SUPPORT_SHARED_DISK if (strcmp(argv[optind], "create") == 0) { exit_status = init_devices(servants_leader); } else if (strcmp(argv[optind], "dump") == 0) { exit_status = dump_headers(servants_leader); } else if (strcmp(argv[optind], "allocate") == 0) { exit_status = allocate_slots(argv[optind + 1], servants_leader); } else if (strcmp(argv[optind], "list") == 0) { exit_status = list_slots(servants_leader); } else if (strcmp(argv[optind], "message") == 0) { exit_status = messenger(argv[optind + 1], argv[optind + 2], servants_leader); } else if (strcmp(argv[optind], "ping") == 0) { exit_status = ping_via_slots(argv[optind + 1], servants_leader); } else #endif if (strcmp(argv[optind], "query-watchdog") == 0) { exit_status = watchdog_info(); } else if (strcmp(argv[optind], "test-watchdog") == 0) { exit_status = watchdog_test(); } else if (strcmp(argv[optind], "watch") == 0) { /* sleep $(sbd $SBD_DEVICE_ARGS dump | grep -m 1 msgwait | awk '{print $4}') 2>/dev/null */ const char *delay_source = delay ? 
"SBD_DELAY_START" : ""; #if SUPPORT_SHARED_DISK if(disk_count > 0) { /* If no devices are specified, its not an error to be unable to find one */ open_any_device(servants_leader); if (delay_start && delay <= 0) { delay = get_first_msgwait(servants_leader); if (delay > 0) { delay_source = "msgwait"; } else { cl_log(LOG_WARNING, "No 'msgwait' value from disk, using '2 * watchdog-timeout' for 'delay' starting"); } } } #endif /* Re-calculate timeout_watchdog_warn based on any timeout_watchdog from: * SBD_WATCHDOG_TIMEOUT, -1 option or on-disk setting read with open_any_device() */ if (do_calculate_timeout_watchdog_warn) { timeout_watchdog_warn = calculate_timeout_watchdog_warn(timeout_watchdog); } if (delay_start) { /* diskless mode or disk read issues causing get_first_msgwait() to return a 0 for delay */ if (delay <= 0) { delay = 2 * timeout_watchdog; delay_source = "watchdog-timeout * 2"; } cl_log(LOG_DEBUG, "Delay start (yes), (delay: %ld), (delay source: %s)", delay, delay_source); sleep((unsigned long) delay); } else { cl_log(LOG_DEBUG, "Delay start (no)"); } /* We only want this to have an effect during watch right now; * pinging and fencing would be too confused */ cl_log(LOG_INFO, "Turning on pacemaker checks: %d", check_pcmk); if (check_pcmk) { recruit_servant("pcmk", 0); #if SUPPORT_PLUGIN check_cluster = 1; #endif } cl_log(LOG_INFO, "Turning on cluster checks: %d", check_cluster); if (check_cluster) { recruit_servant("cluster", 0); } cl_log(LOG_NOTICE, "%s flush + write \'%c\' to sysrq in case of timeout", do_flush?"Do":"Skip", timeout_sysrq_char); exit_status = inquisitor(); } else { exit_status = -2; } out: if (timeout_action) { free(timeout_action); } if (exit_status < 0) { if (exit_status == -2) { usage(); } else { fprintf(stderr, "sbd failed; please check the logs.\n"); } return (1); } return (0); } sbd-1.5.1/src/sbd-md.c000066400000000000000000000735431414464774600144240ustar00rootroot00000000000000/* * Copyright (C) 2013 Lars Marowsky-Bree * * This 
program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ #include "sbd.h" #define SBD_MSG_EMPTY 0x00 #define SBD_MSG_TEST 0x01 #define SBD_MSG_RESET 0x02 #define SBD_MSG_OFF 0x03 #define SBD_MSG_EXIT 0x04 #define SBD_MSG_CRASHDUMP 0x05 #define SLOT_TO_SECTOR(slot) (1+slot*2) #define MBOX_TO_SECTOR(mbox) (2+mbox*2) extern int disk_count; /* These have to match the values in the header of the partition */ static char sbd_magic[8] = "SBD_SBD_"; static char sbd_version = 0x02; struct slot_msg_arg_t { const char* name; const char* msg; }; static signed char cmd2char(const char *cmd) { if (strcmp("clear", cmd) == 0) { return SBD_MSG_EMPTY; } else if (strcmp("test", cmd) == 0) { return SBD_MSG_TEST; } else if (strcmp("reset", cmd) == 0) { return SBD_MSG_RESET; } else if (strcmp("off", cmd) == 0) { return SBD_MSG_OFF; } else if (strcmp("exit", cmd) == 0) { return SBD_MSG_EXIT; } else if (strcmp("crashdump", cmd) == 0) { return SBD_MSG_CRASHDUMP; } return -1; } static const char* char2cmd(const char cmd) { switch (cmd) { case SBD_MSG_EMPTY: return "clear"; break; case SBD_MSG_TEST: return "test"; break; case SBD_MSG_RESET: return "reset"; break; case SBD_MSG_OFF: return "off"; break; case SBD_MSG_EXIT: return "exit"; break; case SBD_MSG_CRASHDUMP: return "crashdump"; break; default: return "undefined"; break; } } static void 
close_device(struct sbd_context *st) { if (!st) { return; } if (st->ioctx) { io_destroy(st->ioctx); } if (st->devfd >= 0) { close(st->devfd); } free(st->buffer); free(st); } static struct sbd_context * open_device(const char* devname, int loglevel) { struct sbd_context *st; if (!devname) return NULL; st = calloc(1, sizeof(struct sbd_context)); if (!st) { return NULL; } st->devfd = -1; if (io_setup(1, &st->ioctx) != 0) { cl_perror("io_setup failed"); goto out; } st->devfd = open(devname, O_SYNC|O_RDWR|O_DIRECT); if (st->devfd == -1) { if (loglevel == LOG_DEBUG) { DBGLOG(loglevel, "Opening device %s failed.", devname); } else { cl_log(loglevel, "Opening device %s failed.", devname); } goto out; } ioctl(st->devfd, BLKSSZGET, §or_size); if (sector_size == 0) { cl_perror("Get sector size failed.\n"); goto out; } if (posix_memalign(&st->buffer, sector_size, sector_size)) { cl_perror("Couldn't allocate sector-buffer."); goto out; } return st; out: close_device(st); return NULL; } static void * sector_alloc(void) { void *x; x = calloc(1, sector_size); if (!x) { exit(1); } return x; } static int sector_io(struct sbd_context *st, int sector, void *data, int rw) { struct timespec timeout; struct io_event event; struct iocb *ios[1] = { &st->io }; long r; timeout.tv_sec = timeout_io; timeout.tv_nsec = 0; memset(&st->io, 0, sizeof(struct iocb)); if (rw) { memcpy(st->buffer, data, sector_size); io_prep_pwrite(&st->io, st->devfd, st->buffer, sector_size, (long long) sector_size * sector); } else { memset(st->buffer, 0, sector_size); io_prep_pread(&st->io, st->devfd, st->buffer, sector_size, (long long) sector_size * sector); } if (io_submit(st->ioctx, 1, ios) != 1) { cl_log(LOG_ERR, "Failed to submit IO request! 
(rw=%d)", rw);
		return -1;
	}

	errno = 0;
	/* Collect exactly one completion event, bounded by `timeout`. */
	r = io_getevents(st->ioctx, 1L, 1L, &event, &timeout);

	if (r < 0 ) {
		cl_log(LOG_ERR, "Failed to retrieve IO events (rw=%d)", rw);
		return -1;
	} else if (r < 1L) {
		/* No event was returned: handled as a timeout, and the
		 * request is cancelled on a best-effort basis. */
		cl_log(LOG_INFO, "Cancelling IO request due to timeout (rw=%d, r=%ld)", rw, r);
		r = io_cancel(st->ioctx, ios[0], &event);
		if (r) {
			DBGLOG(LOG_INFO, "Could not cancel IO request (rw=%d)", rw);
			/* Doesn't really matter, debugging information. */
		}
		return -1;
	} else if (r > 1L) {
		cl_log(LOG_ERR, "More than one IO was returned (r=%ld)", r);
		return -1;
	}

	/* IO is happy */
	if (event.res == sector_size) {
		/* For reads, hand the bounce buffer contents to the caller. */
		if (!rw) {
			memcpy(data, st->buffer, sector_size);
		}
		return 0;
	} else {
		cl_log(LOG_ERR, "Short IO (rw=%d, res=%lu, sector_size=%d)", rw, event.res, sector_size);
		return -1;
	}
}

/* Write one sector from `data`; returns what sector_io() returns. */
static int
sector_write(struct sbd_context *st, int sector, void *data)
{
	return sector_io(st, sector, data, 1);
}

/* Read one sector into `data`; returns what sector_io() returns. */
static int
sector_read(struct sbd_context *st, int sector, void *data)
{
	return sector_io(st, sector, data, 0);
}

/* Read the node entry of `slot` (sector SLOT_TO_SECTOR(slot)). */
static int
slot_read(struct sbd_context *st, int slot, struct sector_node_s *s_node)
{
	return sector_read(st, SLOT_TO_SECTOR(slot), s_node);
}

/* Write the node entry of `slot` (sector SLOT_TO_SECTOR(slot)). */
static int
slot_write(struct sbd_context *st, int slot, struct sector_node_s *s_node)
{
	return sector_write(st, SLOT_TO_SECTOR(slot), s_node);
}

/* Write the mailbox of slot `mbox` (sector MBOX_TO_SECTOR(mbox)). */
static int
mbox_write(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox)
{
	return sector_write(st, MBOX_TO_SECTOR(mbox), s_mbox);
}

/* Read the mailbox of slot `mbox` (sector MBOX_TO_SECTOR(mbox)). */
static int
mbox_read(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox)
{
	return sector_read(st, MBOX_TO_SECTOR(mbox), s_mbox);
}

/*
 * Write a mailbox sector, read it back and compare the full sector.
 * Returns 0 if the read-back data matches, -1 if the write, the read or
 * the comparison fails.
 */
static int
mbox_write_verify(struct sbd_context *st, int mbox, struct sector_mbox_s *s_mbox)
{
	void *data;
	int rc = 0;

	if (sector_write(st, MBOX_TO_SECTOR(mbox), s_mbox) < 0)
		return -1;

	data = sector_alloc();
	if (sector_read(st, MBOX_TO_SECTOR(mbox), data) < 0) {
		rc = -1;
		goto out;
	}

	/* The comparison covers sector_size bytes, not just the struct. */
	if (memcmp(s_mbox, data, sector_size) != 0) {
		cl_log(LOG_ERR, "Write verification failed!");
		rc = -1;
		goto out;
	}
	rc = 0;
out:
free(data); return rc; } static int header_write(struct sbd_context *st, struct sector_header_s *s_header) { s_header->sector_size = htonl(s_header->sector_size); s_header->timeout_watchdog = htonl(s_header->timeout_watchdog); s_header->timeout_allocate = htonl(s_header->timeout_allocate); s_header->timeout_loop = htonl(s_header->timeout_loop); s_header->timeout_msgwait = htonl(s_header->timeout_msgwait); return sector_write(st, 0, s_header); } static int header_read(struct sbd_context *st, struct sector_header_s *s_header) { if (sector_read(st, 0, s_header) < 0) return -1; s_header->sector_size = ntohl(s_header->sector_size); s_header->timeout_watchdog = ntohl(s_header->timeout_watchdog); s_header->timeout_allocate = ntohl(s_header->timeout_allocate); s_header->timeout_loop = ntohl(s_header->timeout_loop); s_header->timeout_msgwait = ntohl(s_header->timeout_msgwait); /* This sets the global defaults: */ timeout_watchdog = s_header->timeout_watchdog; timeout_allocate = s_header->timeout_allocate; timeout_loop = s_header->timeout_loop; timeout_msgwait = s_header->timeout_msgwait; return 0; } static int valid_header(const struct sector_header_s *s_header) { if (memcmp(s_header->magic, sbd_magic, sizeof(s_header->magic)) != 0) { cl_log(LOG_ERR, "Header magic does not match."); return -1; } if (s_header->version != sbd_version) { cl_log(LOG_ERR, "Header version does not match."); return -1; } if (s_header->sector_size != sector_size) { cl_log(LOG_ERR, "Header sector size does not match."); return -1; } return 0; } static struct sector_header_s * header_get(struct sbd_context *st) { struct sector_header_s *s_header; s_header = sector_alloc(); if (header_read(st, s_header) < 0) { cl_log(LOG_ERR, "Unable to read header from device %d", st->devfd); free(s_header); return NULL; } if (valid_header(s_header) < 0) { cl_log(LOG_ERR, "header on device %d is not valid.", st->devfd); free(s_header); return NULL; } /* cl_log(LOG_INFO, "Found version %d header with %d slots", 
s_header->version, s_header->slots); */ return s_header; } static int header_dump(struct sbd_context *st) { struct sector_header_s *s_header; char uuid[37]; s_header = header_get(st); if (s_header == NULL) return -1; printf("Header version : %u.%u\n", s_header->version, s_header->minor_version); if (s_header->minor_version > 0) { uuid_unparse_lower(s_header->uuid, uuid); printf("UUID : %s\n", uuid); } printf("Number of slots : %u\n", s_header->slots); printf("Sector size : %lu\n", (unsigned long)s_header->sector_size); printf("Timeout (watchdog) : %lu\n", (unsigned long)s_header->timeout_watchdog); printf("Timeout (allocate) : %lu\n", (unsigned long)s_header->timeout_allocate); printf("Timeout (loop) : %lu\n", (unsigned long)s_header->timeout_loop); printf("Timeout (msgwait) : %lu\n", (unsigned long)s_header->timeout_msgwait); free(s_header); return 0; } static int init_device(struct sbd_context *st) { struct sector_header_s *s_header; struct sector_node_s *s_node; struct sector_mbox_s *s_mbox; char uuid[37]; int i; int rc = 0; s_header = sector_alloc(); s_node = sector_alloc(); s_mbox = sector_alloc(); memcpy(s_header->magic, sbd_magic, sizeof(s_header->magic)); s_header->version = sbd_version; s_header->slots = 255; s_header->sector_size = sector_size; s_header->timeout_watchdog = timeout_watchdog; s_header->timeout_allocate = timeout_allocate; s_header->timeout_loop = timeout_loop; s_header->timeout_msgwait = timeout_msgwait; s_header->minor_version = 1; uuid_generate(s_header->uuid); uuid_unparse_lower(s_header->uuid, uuid); cl_log(LOG_INFO, "Creating version %d.%d header on device %d (uuid: %s)", s_header->version, s_header->minor_version, st->devfd, uuid); fprintf(stdout, "Creating version %d.%d header on device %d (uuid: %s)\n", s_header->version, s_header->minor_version, st->devfd, uuid); if (header_write(st, s_header) < 0) { rc = -1; goto out; } cl_log(LOG_INFO, "Initializing %d slots on device %d", s_header->slots, st->devfd); fprintf(stdout, 
"Initializing %d slots on device %d\n", s_header->slots, st->devfd); for (i=0;i < s_header->slots;i++) { if (slot_write(st, i, s_node) < 0) { rc = -1; goto out; } if (mbox_write(st, i, s_mbox) < 0) { rc = -1; goto out; } } out: free(s_node); free(s_header); free(s_mbox); return(rc); } /* Check if there already is a slot allocated to said name; returns the * slot number. If not found, returns -1. * This is necessary because slots might not be continuous. */ static int slot_lookup(struct sbd_context *st, const struct sector_header_s *s_header, const char *name) { struct sector_node_s *s_node = NULL; int i; int rc = -1; if (!name) { cl_log(LOG_ERR, "slot_lookup(): No name specified.\n"); goto out; } s_node = sector_alloc(); for (i=0; i < s_header->slots; i++) { if (slot_read(st, i, s_node) < 0) { rc = -2; goto out; } if (s_node->in_use != 0) { if (strncasecmp(s_node->name, name, SECTOR_NAME_MAX) == 0) { DBGLOG(LOG_INFO, "%s owns slot %d", name, i); rc = i; goto out; } } } out: free(s_node); return rc; } static int slot_unused(struct sbd_context *st, const struct sector_header_s *s_header) { struct sector_node_s *s_node; int i; int rc = -1; s_node = sector_alloc(); for (i=0; i < s_header->slots; i++) { if (slot_read(st, i, s_node) < 0) { rc = -1; goto out; } if (s_node->in_use == 0) { rc = i; goto out; } } out: free(s_node); return rc; } static int slot_allocate(struct sbd_context *st, const char *name) { struct sector_header_s *s_header = NULL; struct sector_node_s *s_node = NULL; struct sector_mbox_s *s_mbox = NULL; int i; int rc = 0; if (!name) { cl_log(LOG_ERR, "slot_allocate(): No name specified.\n"); fprintf(stderr, "slot_allocate(): No name specified.\n"); rc = -1; goto out; } s_header = header_get(st); if (!s_header) { rc = -1; goto out; } s_node = sector_alloc(); s_mbox = sector_alloc(); while (1) { i = slot_lookup(st, s_header, name); if ((i >= 0) || (i == -2)) { /* -1 is "no slot found", in which case we * proceed to allocate a new one. 
* -2 is "read error during lookup", in which
			 * case we error out too
			 * >= 0 is "slot already allocated" */
			rc = i;
			goto out;
		}

		i = slot_unused(st, s_header);
		if (i >= 0) {
			cl_log(LOG_INFO, "slot %d is unused - trying to own", i);
			fprintf(stdout, "slot %d is unused - trying to own\n", i);
			memset(s_node, 0, sizeof(*s_node));
			s_node->in_use = 1;
			strncpy(s_node->name, name, SECTOR_NAME_MAX);
			if (slot_write(st, i, s_node) < 0) {
				rc = -1;
				goto out;
			}
			/* Wait, then loop: the next slot_lookup() pass decides
			 * whether the claim is visible under our name. */
			sleep(timeout_allocate);
		} else {
			cl_log(LOG_ERR, "No more free slots.");
			fprintf(stderr, "No more free slots.\n");
			rc = -1;
			goto out;
		}
	}

out:	free(s_node);
	free(s_header);
	free(s_mbox);
	return(rc);
}

/*
 * Print one tab-separated line (slot number, node name, pending command,
 * sender) to stdout for every slot that is in use.
 * Returns 0 on success, -1 if the header, a node entry or a mailbox
 * cannot be read.
 */
static int
slot_list(struct sbd_context *st)
{
	struct sector_header_s	*s_header = NULL;
	struct sector_node_s	*s_node = NULL;
	struct sector_mbox_s	*s_mbox = NULL;
	int 			i;
	int			rc = 0;

	s_header = header_get(st);
	if (!s_header) {
		rc = -1;
		goto out;
	}

	s_node = sector_alloc();
	s_mbox = sector_alloc();

	for (i=0; i < s_header->slots; i++) {
		if (slot_read(st, i, s_node) < 0) {
			rc = -1;
			goto out;
		}
		if (s_node->in_use > 0) {
			if (mbox_read(st, i, s_mbox) < 0) {
				rc = -1;
				goto out;
			}
			printf("%d\t%s\t%s\t%s\n",
				i, s_node->name, char2cmd(s_mbox->cmd),
				s_mbox->from);
		}
	}

out:	free(s_node);
	free(s_header);
	free(s_mbox);
	return rc;
}

/*
 * Write command `cmd` into the mailbox of node `name` on this device.
 * The name "LOCAL" is replaced by local_uname.
 */
static int
slot_msg(struct sbd_context *st, const char *name, const char *cmd)
{
	struct sector_header_s	*s_header = NULL;
	struct sector_mbox_s	*s_mbox = NULL;
	int			mbox;
	int			rc = 0;
	char			uuid[37];

	if (!name || !cmd) {
		cl_log(LOG_ERR, "slot_msg(): No recipient / cmd specified.\n");
		rc = -1;
		goto out;
	}

	s_header = header_get(st);
	if (!s_header) {
		rc = -1;
		goto out;
	}

	if (strcmp(name, "LOCAL") == 0) {
		name = local_uname;
	}

	if (s_header->minor_version > 0) {
		uuid_unparse_lower(s_header->uuid, uuid);
		cl_log(LOG_INFO, "Device UUID: %s", uuid);
	}

	mbox = slot_lookup(st, s_header, name);
	if (mbox < 0) {
		cl_log(LOG_ERR, "slot_msg(): No slot found for %s.", name);
		rc = -1;
		goto out;
	}

	s_mbox = sector_alloc();

	s_mbox->cmd =
cmd2char(cmd); if (s_mbox->cmd < 0) { cl_log(LOG_ERR, "slot_msg(): Invalid command %s.", cmd); rc = -1; goto out; } strncpy(s_mbox->from, local_uname, SECTOR_NAME_MAX); cl_log(LOG_INFO, "Writing %s to node slot %s", cmd, name); if (mbox_write_verify(st, mbox, s_mbox) < -1) { rc = -1; goto out; } if (strcasecmp(cmd, "exit") != 0) { cl_log(LOG_INFO, "Messaging delay: %d", (int)timeout_msgwait); sleep(timeout_msgwait); } cl_log(LOG_INFO, "%s successfully delivered to %s", cmd, name); out: free(s_mbox); free(s_header); return rc; } static int slot_ping(struct sbd_context *st, const char *name) { struct sector_header_s *s_header = NULL; struct sector_mbox_s *s_mbox = NULL; int mbox; int waited = 0; int rc = 0; if (!name) { cl_log(LOG_ERR, "slot_ping(): No recipient specified.\n"); rc = -1; goto out; } s_header = header_get(st); if (!s_header) { rc = -1; goto out; } if (strcmp(name, "LOCAL") == 0) { name = local_uname; } mbox = slot_lookup(st, s_header, name); if (mbox < 0) { cl_log(LOG_ERR, "slot_msg(): No slot found for %s.", name); rc = -1; goto out; } s_mbox = sector_alloc(); s_mbox->cmd = SBD_MSG_TEST; strncpy(s_mbox->from, local_uname, SECTOR_NAME_MAX); DBGLOG(LOG_DEBUG, "Pinging node %s", name); if (mbox_write(st, mbox, s_mbox) < -1) { rc = -1; goto out; } rc = -1; while (waited <= timeout_msgwait) { if (mbox_read(st, mbox, s_mbox) < 0) break; if (s_mbox->cmd != SBD_MSG_TEST) { rc = 0; break; } sleep(1); waited++; } if (rc == 0) { cl_log(LOG_DEBUG, "%s successfully pinged.", name); } else { cl_log(LOG_ERR, "%s failed to ping.", name); } out: free(s_mbox); free(s_header); return rc; } int init_devices(struct servants_list_item *servants) { int rc = 0; struct sbd_context *st; struct servants_list_item *s; for (s = servants; s; s = s->next) { fprintf(stdout, "Initializing device %s\n", s->devname); st = open_device(s->devname, LOG_ERR); if (!st) { return -1; } rc = init_device(st); close_device(st); if (rc == -1) { fprintf(stderr, "Failed to init device %s\n", 
s->devname); return rc; } fprintf(stdout, "Device %s is initialized.\n", s->devname); } fprintf(stdout, "Did you check sbd service down on all nodes before? If not do so now and restart afterwards.\n"); return 0; } static int slot_msg_wrapper(const char* devname, int mode, const void* argp) { int rc = 0; struct sbd_context *st; const struct slot_msg_arg_t* arg = (const struct slot_msg_arg_t*)argp; st = open_device(devname, LOG_WARNING); if (!st) return -1; cl_log(LOG_INFO, "Delivery process handling %s", devname); rc = slot_msg(st, arg->name, arg->msg); close_device(st); return rc; } static int slot_ping_wrapper(const char* devname, int mode, const void* argp) { int rc = 0; const char* name = (const char*)argp; struct sbd_context *st; st = open_device(devname, LOG_WARNING); if (!st) return -1; rc = slot_ping(st, name); close_device(st); return rc; } int allocate_slots(const char *name, struct servants_list_item *servants) { int rc = 0; struct sbd_context *st; struct servants_list_item *s; for (s = servants; s; s = s->next) { fprintf(stdout, "Trying to allocate slot for %s on device %s.\n", name, s->devname); st = open_device(s->devname, LOG_WARNING); if (!st) { return -1; } rc = slot_allocate(st, name); close_device(st); if (rc < 0) return rc; fprintf(stdout, "Slot for %s has been allocated on %s.\n", name, s->devname); } return 0; } int list_slots(struct servants_list_item *servants) { int rc = 0; struct servants_list_item *s; struct sbd_context *st; for (s = servants; s; s = s->next) { int rv = 0; st = open_device(s->devname, LOG_WARNING); if (!st) { rc = -1; fprintf(stderr, "== disk %s unreadable!\n", s->devname); continue; } rv = slot_list(st); close_device(st); if (rv == -1) { rc = -1; fprintf(stderr, "== Slots on disk %s NOT dumped\n", s->devname); } } return rc; } int ping_via_slots(const char *name, struct servants_list_item *servants) { int sig = 0; pid_t pid = 0; int status = 0; int servants_finished = 0; sigset_t procmask; siginfo_t sinfo; struct 
servants_list_item *s;

	/* SIGCHLD is blocked so it can be collected with sigwaitinfo(). */
	sigemptyset(&procmask);
	sigaddset(&procmask, SIGCHLD);
	sigprocmask(SIG_BLOCK, &procmask, NULL);

	/* One slot_ping_wrapper servant per disk entry; the value returned
	 * by assign_servant() (defined elsewhere) is kept as its pid. */
	for (s = servants; s; s = s->next) {
		if(sbd_is_disk(s)) {
			s->pid = assign_servant(s->devname, &slot_ping_wrapper, 0, (const void*)name);
		}
	}

	while (servants_finished < disk_count) {
		sig = sigwaitinfo(&procmask, &sinfo);
		if (sig == SIGCHLD) {
			/* Reap children until wait() reports there are none left. */
			while ((pid = wait(&status))) {
				if (pid == -1 && errno == ECHILD) {
					break;
				} else {
					s = lookup_servant_by_pid(pid);
					if (sbd_is_disk(s)) {
						servants_finished++;
					}
				}
			}
		}
	}
	/* NOTE(review): the exit status of the ping servants is collected
	 * but not evaluated; this function always returns 0. */
	return 0;
}

/* Non-zero when good_servants is more than half of disk_count
 * (integer division). */
int quorum_write(int good_servants)
{
	return (good_servants > disk_count/2);
}

/*
 * Deliver command `msg` to node `name` through every device in the
 * list, one slot_msg_wrapper servant per entry.
 *
 * Waits for servants until quorum_write() is satisfied by the number of
 * servants that exited with status 0, or until disk_count servants have
 * finished.
 * Returns 0 if that quorum was reached, -1 otherwise.
 */
int messenger(const char *name, const char *msg, struct servants_list_item *servants)
{
	int sig = 0;
	pid_t pid = 0;
	int status = 0;
	int servants_finished = 0;
	int successful_delivery = 0;
	sigset_t procmask;
	siginfo_t sinfo;
	struct servants_list_item *s;
	struct slot_msg_arg_t slot_msg_arg = {name, msg};

	/* SIGCHLD is blocked so it can be collected with sigwaitinfo(). */
	sigemptyset(&procmask);
	sigaddset(&procmask, SIGCHLD);
	sigprocmask(SIG_BLOCK, &procmask, NULL);

	for (s = servants; s; s = s->next) {
		s->pid = assign_servant(s->devname, &slot_msg_wrapper, 0, &slot_msg_arg);
	}

	while (!(quorum_write(successful_delivery) ||
		(servants_finished == disk_count))) {
		sig = sigwaitinfo(&procmask, &sinfo);
		if (sig == SIGCHLD) {
			/* WNOHANG: waitpid() returns 0 and ends this inner loop
			 * once no further child has changed state. */
			while ((pid = waitpid(-1, &status, WNOHANG))) {
				if (pid == -1 && errno == ECHILD) {
					break;
				} else {
					servants_finished++;
					if (WIFEXITED(status)
						&& WEXITSTATUS(status) == 0) {
						DBGLOG(LOG_INFO, "Process %d succeeded.",
								(int)pid);
						successful_delivery++;
					} else {
						cl_log(LOG_WARNING, "Process %d failed to deliver!",
								(int)pid);
					}
				}
			}
		}
	}
	if (quorum_write(successful_delivery)) {
		cl_log(LOG_INFO, "Message successfully delivered.");
		return 0;
	} else {
		cl_log(LOG_ERR, "Message is not delivered via more then a half of devices");
		return -1;
	}
}

/*
 * Return the msgwait timeout stored in the header of the first device
 * in the list that can be opened and has a valid header.
 */
unsigned long
get_first_msgwait(struct servants_list_item *servants)
{
	unsigned long msgwait = 0;
	struct servants_list_item *s = servants;

	for (s = servants; s; s = s->next) {
		struct
sbd_context *st;
		struct sector_header_s *s_header;

		st = open_device(s->devname, LOG_WARNING);
		if (!st) {
			continue;
		}

		s_header = header_get(st);
		if (s_header != NULL) {
			/* First usable header wins. */
			msgwait = (unsigned long)s_header->timeout_msgwait;
			close_device(st);
			free(s_header);
			return msgwait;
		}

		close_device(st);
	}

	/* No device yielded a header: msgwait is still 0. */
	return msgwait;
}

/*
 * Print the header of every device in the list, framed by "==" status
 * lines.
 * Returns 0 if all headers were dumped, -1 if at least one device could
 * not be opened or dumped.
 */
int dump_headers(struct servants_list_item *servants)
{
	int rc = 0;
	struct servants_list_item *s = servants;
	struct sbd_context *st;

	for (s = servants; s; s = s->next) {
		int rv;

		fprintf(stdout, "==Dumping header on disk %s\n", s->devname);
		st = open_device(s->devname, LOG_WARNING);
		if (st) {
			rv = header_dump(st);
			close_device(st);
		} else {
			fprintf(stderr, "== disk %s unreadable!\n", s->devname);
			rv = -1;
		}

		if (rv == -1) {
			rc = -1;
			fprintf(stderr, "==Header on disk %s NOT dumped\n", s->devname);
		} else {
			fprintf(stdout, "==Header on disk %s is dumped\n", s->devname);
		}
	}
	return rc;
}

/*
 * Wait until a valid header can be read from any device in the list and
 * adopt its timeouts as the global settings.
 *
 * The list is rescanned, sleeping timeout_loop seconds between passes,
 * while no header has been found and less than timeout_startup seconds
 * have elapsed.  If no header is found the process exits with status 1.
 */
void open_any_device(struct servants_list_item *servants)
{
	struct sector_header_s *hdr_cur = NULL;
	struct timespec t_0;
	int t_wait = 0;
	bool logged_once = false;

	clock_gettime(CLOCK_MONOTONIC, &t_0);

	while (!hdr_cur && t_wait < timeout_startup) {
		struct timespec t_now;
		struct servants_list_item* s;

		for (s = servants; s; s = s->next) {
			struct sbd_context *st = open_device(s->devname, LOG_DEBUG);

			if (!st) {
				/* logged_once limits the warnings to one for the
				 * whole wait. */
				if (logged_once == false) {
					cl_log(LOG_WARNING, "Failed to open %s. "
					       "Trying any other configured devices, "
					       "otherwise retrying every %ds within %ds",
					       s->devname, timeout_loop, timeout_startup);
					logged_once = true;
				}
				continue;
			}

			hdr_cur = header_get(st);
			close_device(st);

			if (hdr_cur) {
				break;
			} else {
				if (logged_once == false) {
					cl_log(LOG_WARNING, "Failed to read header from %s. "
					       "Trying any other configured devices, "
					       "otherwise retrying every %ds within %ds",
					       s->devname, timeout_loop, timeout_startup);
					logged_once = true;
				}
			}
		}
		clock_gettime(CLOCK_MONOTONIC, &t_now);
		t_wait = t_now.tv_sec - t_0.tv_sec;
		if (!hdr_cur) {
			sleep(timeout_loop);
		}
	}

	if (hdr_cur) {
		timeout_watchdog = hdr_cur->timeout_watchdog;
		timeout_allocate = hdr_cur->timeout_allocate;
		timeout_loop = hdr_cur->timeout_loop;
		timeout_msgwait = hdr_cur->timeout_msgwait;
	} else {
		cl_log(LOG_ERR, "No devices were available at start-up within %i seconds.",
		       timeout_startup);
		exit(1);
	}

	free(hdr_cur);
	return;
}

/*
 ::-::-::-::-::-::-::-::-::-::-::-::-::
 Begin disk based servant code
 ::-::-::-::-::-::-::-::-::-::-::-::-::
 */

/*
 * Compare the four timeouts of a device header with the current global
 * timeouts.  Only the first difference is logged.
 * Returns 0 if all four match, -1 otherwise.
 */
static int servant_check_timeout_inconsistent(struct sector_header_s *hdr)
{
	if (timeout_watchdog != hdr->timeout_watchdog) {
		cl_log(LOG_WARNING, "watchdog timeout: %d versus %d on this device",
				(int)timeout_watchdog, (int)hdr->timeout_watchdog);
		return -1;
	}
	if (timeout_allocate != hdr->timeout_allocate) {
		cl_log(LOG_WARNING, "allocate timeout: %d versus %d on this device",
				(int)timeout_allocate, (int)hdr->timeout_allocate);
		return -1;
	}
	if (timeout_loop != hdr->timeout_loop) {
		cl_log(LOG_WARNING, "loop timeout: %d versus %d on this device",
				(int)timeout_loop, (int)hdr->timeout_loop);
		return -1;
	}
	if (timeout_msgwait != hdr->timeout_msgwait) {
		cl_log(LOG_WARNING, "msgwait timeout: %d versus %d on this device",
				(int)timeout_msgwait, (int)hdr->timeout_msgwait);
		return -1;
	}
	return 0;
}

/*
 * Servant process body for one shared disk: watches the local node's
 * slot and mailbox on `diskname` and signals the parent process.
 *
 * mode: a value > 0 makes the first start inspect the mailbox for a
 *       left-over message before clearing it.
 * argp: the servants_list_item of this servant (read for first_start).
 *
 * Does not return; the function ends in exit().
 */
int servant_md(const char *diskname, int mode, const void* argp)
{
	struct sector_mbox_s *s_mbox = NULL;
	struct sector_node_s *s_node = NULL;
	struct sector_header_s	*s_header = NULL;
	int mbox;
	int rc = 0;
	time_t t0, t1, latency;
	union sigval signal_value;
	sigset_t servant_masks;
	struct sbd_context *st;
	pid_t ppid;
	char uuid[37];
	const struct servants_list_item *s = argp;

	cl_log(LOG_INFO, "Servant starting for device %s", diskname);

	/* Block most of the signals */
sigfillset(&servant_masks);
	/* These signals are removed from the set again, i.e. stay unblocked. */
	sigdelset(&servant_masks, SIGKILL);
	sigdelset(&servant_masks, SIGFPE);
	sigdelset(&servant_masks, SIGILL);
	sigdelset(&servant_masks, SIGSEGV);
	sigdelset(&servant_masks, SIGBUS);
	sigdelset(&servant_masks, SIGALRM);
	/* FIXME: check error */
	sigprocmask(SIG_SETMASK, &servant_masks, NULL);

	st = open_device(diskname, LOG_WARNING);
	if (!st) {
		exit(EXIT_MD_SERVANT_IO_FAIL);
	}

	s_header = header_get(st);
	if (!s_header) {
		cl_log(LOG_ERR, "Not a valid header on %s", diskname);
		rc = EXIT_MD_SERVANT_IO_FAIL;
		goto out;
	}

	/* The header on this disk must carry the same timeouts as the
	 * globals currently in effect. */
	if (servant_check_timeout_inconsistent(s_header) < 0) {
		cl_log(LOG_ERR, "Timeouts on %s do not match first device",
				diskname);
		rc = EXIT_MD_SERVANT_IO_FAIL;
		goto out;
	}

	if (s_header->minor_version > 0) {
		uuid_unparse_lower(s_header->uuid, uuid);
		cl_log(LOG_INFO, "Device %s uuid: %s", diskname, uuid);
	}

	/* Finds the slot of local_uname, claiming a free one if needed. */
	mbox = slot_allocate(st, local_uname);
	if (mbox < 0) {
		cl_log(LOG_ERR,
		       "No slot allocated, and automatic allocation failed for disk %s.",
		       diskname);
		rc = EXIT_MD_SERVANT_IO_FAIL;
		goto out;
	}
	/* Snapshot of our node entry; the monitoring loop compares the
	 * on-disk entry against it on every pass. */
	s_node = sector_alloc();
	if (slot_read(st, mbox, s_node) < 0) {
		cl_log(LOG_ERR, "Unable to read node entry on %s",
				diskname);
		rc = EXIT_MD_SERVANT_IO_FAIL;
		goto out;
	}

	cl_log(LOG_NOTICE, "Monitoring slot %d on disk %s", mbox, diskname);
	if (s_header->minor_version == 0) {
		set_proc_title("sbd: watcher: %s - slot: %d", diskname, mbox);
	} else {
		set_proc_title("sbd: watcher: %s - slot: %d - uuid: %s",
				diskname, mbox, uuid);
	}

	s_mbox = sector_alloc();
	if (s->first_start) {
		if (mode > 0) {
			if (mbox_read(st, mbox, s_mbox) < 0) {
				cl_log(LOG_ERR, "mbox read failed during start-up in servant.");
				rc = EXIT_MD_SERVANT_IO_FAIL;
				goto out;
			}
			if (s_mbox->cmd != SBD_MSG_EXIT &&
					s_mbox->cmd != SBD_MSG_EMPTY) {
				/* Not a clean stop. Abort start-up */
				cl_log(LOG_WARNING, "Found fencing message - aborting start-up.
Manual intervention required!"); ppid = getppid(); sigqueue(ppid, SIG_EXITREQ, signal_value); rc = 0; goto out; } } DBGLOG(LOG_INFO, "First servant start - zeroing inbox"); memset(s_mbox, 0, sizeof(*s_mbox)); if (mbox_write(st, mbox, s_mbox) < 0) { rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } } memset(&signal_value, 0, sizeof(signal_value)); while (1) { struct sector_header_s *s_header_retry = NULL; struct sector_node_s *s_node_retry = NULL; t0 = time(NULL); sleep(timeout_loop); ppid = getppid(); if (ppid == 1) { /* Our parent died unexpectedly. Triggering * self-fence. */ do_timeout_action(); } /* These attempts are, by definition, somewhat racy. If * the device is wiped out or corrupted between here and * us reading our mbox, there is nothing we can do about * that. But at least we tried. */ s_header_retry = header_get(st); if (!s_header_retry) { cl_log(LOG_ERR, "No longer found a valid header on %s", diskname); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } if (memcmp(s_header, s_header_retry, sizeof(*s_header)) != 0) { cl_log(LOG_ERR, "Header on %s changed since start-up!", diskname); free(s_header_retry); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } free(s_header_retry); s_node_retry = sector_alloc(); if (slot_read(st, mbox, s_node_retry) < 0) { cl_log(LOG_ERR, "slot read failed in servant."); free(s_node_retry); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } if (memcmp(s_node, s_node_retry, sizeof(*s_node)) != 0) { cl_log(LOG_ERR, "Node entry on %s changed since start-up!", diskname); free(s_node_retry); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } free(s_node_retry); if (mbox_read(st, mbox, s_mbox) < 0) { cl_log(LOG_ERR, "mbox read failed in servant."); rc = EXIT_MD_SERVANT_IO_FAIL; goto out; } if (s_mbox->cmd > 0) { cl_log(LOG_NOTICE, "Received command %s from %s on disk %s", char2cmd(s_mbox->cmd), s_mbox->from, diskname); switch (s_mbox->cmd) { case SBD_MSG_TEST: memset(s_mbox, 0, sizeof(*s_mbox)); mbox_write(st, mbox, s_mbox); sigqueue(ppid, SIG_TEST, signal_value); break; 
case SBD_MSG_RESET: rc = EXIT_MD_SERVANT_REQUEST_RESET; goto out; case SBD_MSG_OFF: rc = EXIT_MD_SERVANT_REQUEST_SHUTOFF; goto out; case SBD_MSG_EXIT: sigqueue(ppid, SIG_EXITREQ, signal_value); break; case SBD_MSG_CRASHDUMP: rc = EXIT_MD_SERVANT_REQUEST_CRASHDUMP; goto out; default: /* FIXME: An "unknown" message might result from a partial write. log it and clear the slot. */ cl_log(LOG_ERR, "Unknown message on disk %s", diskname); memset(s_mbox, 0, sizeof(*s_mbox)); mbox_write(st, mbox, s_mbox); break; } } sigqueue(ppid, SIG_LIVENESS, signal_value); t1 = time(NULL); latency = t1 - t0; if (timeout_watchdog_warn && (latency > timeout_watchdog_warn)) { cl_log(LOG_WARNING, "Latency: %ds exceeded watchdog warning timeout %ds on disk %s", (int)latency, (int)timeout_watchdog_warn, diskname); } else if (debug) { DBGLOG(LOG_DEBUG, "Latency: %ds on disk %s", (int)latency, diskname); } } out: free(s_header); free(s_node); free(s_mbox); close_device(st); exit(rc); } sbd-1.5.1/src/sbd-pacemaker.c000066400000000000000000000434531414464774600157510ustar00rootroot00000000000000/* * Copyright (C) 2013 Lars Marowsky-Bree * * Based on crm_mon.c, which was: * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
*/ /* TODO list: * * - Trying to shutdown a node if no devices are up will fail, since SBD * currently uses a message via the disk to achieve this. * * - Shutting down cluster nodes while the majority of devices is down * will eventually take the cluster below the quorum threshold, at which * time the remaining cluster nodes will all immediately suicide. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sbd.h" #ifndef HAVE_PE_NEW_WORKING_SET #define pe_reset_working_set(data_set) cleanup_calculations(data_set) static pe_working_set_t * pe_new_working_set() { pe_working_set_t *data_set = calloc(1, sizeof(pe_working_set_t)); if (data_set != NULL) { set_working_set_defaults(data_set); } return data_set; } static void pe_free_working_set(pe_working_set_t *data_set) { if (data_set != NULL) { pe_reset_working_set(data_set); free(data_set); } } #endif static void clean_up(int rc); #if USE_PACEMAKERD_API #include static pcmk_ipc_api_t *pacemakerd_api = NULL; static time_t last_ok = (time_t) 0; static void pacemakerd_event_cb(pcmk_ipc_api_t *pacemakerd_api, enum pcmk_ipc_event event_type, crm_exit_t status, void *event_data, void *user_data) { pcmk_pacemakerd_api_reply_t *reply = event_data; switch (event_type) { case pcmk_ipc_event_disconnect: /* Unexpected */ cl_log(LOG_ERR, "Lost connection to pacemakerd\n"); return; case pcmk_ipc_event_reply: break; default: return; } if (status != CRM_EX_OK) { cl_log(LOG_ERR, "Bad reply from pacemakerd: %s", crm_exit_str(status)); return; } if (reply->reply_type != pcmk_pacemakerd_reply_ping) { cl_log(LOG_ERR, "Unknown reply type %d from pacemakerd\n", reply->reply_type); } else { if ((reply->data.ping.last_good != (time_t) 0) && (reply->data.ping.status == pcmk_rc_ok)) { switch (reply->data.ping.state) { case pcmk_pacemakerd_state_running: case pcmk_pacemakerd_state_shutting_down: last_ok = 
reply->data.ping.last_good; break; case pcmk_pacemakerd_state_shutdown_complete: clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN); break; default: break; } } } } #endif extern int disk_count; static void clean_up(int rc); static void crm_diff_update(const char *event, xmlNode * msg); static int cib_connect(gboolean full); static void compute_status(pe_working_set_t * data_set); static gboolean mon_refresh_state(gpointer user_data); static GMainLoop *mainloop = NULL; static guint timer_id_reconnect = 0; static guint timer_id_notify = 0; static int reconnect_msec = 1000; static int cib_connected = 0; static cib_t *cib = NULL; static xmlNode *current_cib = NULL; static pe_working_set_t *data_set = NULL; static long last_refresh = 0; static int pcmk_clean_shutdown = 0; static int pcmk_shutdown = 0; static gboolean mon_timer_reconnect(gpointer data) { int rc = 0; if (timer_id_reconnect > 0) { g_source_remove(timer_id_reconnect); } rc = cib_connect(TRUE); if (rc != 0) { cl_log(LOG_WARNING, "CIB reconnect failed: %d", rc); timer_id_reconnect = g_timeout_add(reconnect_msec, mon_timer_reconnect, NULL); } else { cl_log(LOG_INFO, "CIB reconnect successful"); } return FALSE; } static void mon_cib_connection_destroy(gpointer user_data) { if (cib) { cib->cmds->signoff(cib); /* retrigger as last one might have been skipped */ mon_refresh_state(NULL); if ((pcmk_clean_shutdown) && (!sync_resource_startup)) { /* assume a graceful pacemaker-shutdown */ clean_up(EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN); } /* getting here we aren't sure about the pacemaker-state so try to use the timeout to reconnect and get everything sorted out again */ pcmk_shutdown = 0; set_servant_health(pcmk_health_transient, LOG_WARNING, "Disconnected from CIB"); timer_id_reconnect = g_timeout_add(reconnect_msec, mon_timer_reconnect, NULL); } cib_connected = 0; /* no sense in looking into outdated cib, trying to apply patch, ... 
*/ if (current_cib) { free_xml(current_cib); current_cib = NULL; } return; } static void mon_retrieve_current_cib() { xmlNode *xml_cib = NULL; int options = cib_scope_local | cib_sync_call; int rc = pcmk_ok; const char* element_name; free_xml(current_cib); current_cib = NULL; rc = cib->cmds->query(cib, NULL, &xml_cib, options); if (rc != pcmk_ok) { crm_err("Couldn't retrieve the CIB: %s (%d)", pcmk_strerror(rc), rc); free_xml(xml_cib); return; } else if (xml_cib == NULL) { crm_err("Couldn't retrieve the CIB: empty result"); return; } element_name = crm_element_name(xml_cib); if (element_name && !strcmp(element_name, XML_TAG_CIB)) { current_cib = xml_cib; } else { free_xml(xml_cib); } return; } static gboolean mon_timer_notify(gpointer data) { static int counter = 0; int counter_max = timeout_watchdog / timeout_loop / 2; if (timer_id_notify > 0) { g_source_remove(timer_id_notify); } #if USE_PACEMAKERD_API { time_t now = time(NULL); if ((last_ok <= now) && (now - last_ok < timeout_watchdog)) { #endif if (cib_connected) { if (counter == counter_max) { mon_retrieve_current_cib(); mon_refresh_state(NULL); counter = 0; } else { cib->cmds->noop(cib, 0); notify_parent(); counter++; } } #if USE_PACEMAKERD_API } } if (pcmk_connect_ipc(pacemakerd_api, pcmk_ipc_dispatch_main) == pcmk_rc_ok) { pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name); } #endif timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL); return FALSE; } /* * Mainloop signal handler. 
*/ static void mon_shutdown(int nsig) { clean_up(0); } static int cib_connect(gboolean full) { int rc = 0; CRM_CHECK(cib != NULL, return -EINVAL); cib_connected = 0; crm_xml_init(); if (cib->state != cib_connected_query && cib->state != cib_connected_command) { rc = cib->cmds->signon(cib, crm_system_name, cib_query); if (rc != 0) { return rc; } mon_retrieve_current_cib(); mon_refresh_state(NULL); if (full) { if (rc == 0) { rc = cib->cmds->set_connection_dnotify(cib, mon_cib_connection_destroy); if (rc == -EPROTONOSUPPORT) { /* Notification setup failed, won't be able to reconnect after failure */ rc = 0; } } if (rc == 0) { cib->cmds->del_notify_callback(cib, T_CIB_DIFF_NOTIFY, crm_diff_update); rc = cib->cmds->add_notify_callback(cib, T_CIB_DIFF_NOTIFY, crm_diff_update); } if (rc != 0) { /* Notification setup failed, could not monitor CIB actions */ clean_up(-rc); } } } if (!rc) { cib_connected = 1; } return rc; } static void compute_status(pe_working_set_t * data_set) { static int updates = 0; static int ever_had_quorum = FALSE; node_t *node = NULL; updates++; if (data_set->dc_node == NULL) { set_servant_health(pcmk_health_transient, LOG_INFO, "We don't have a DC right now."); notify_parent(); return; } node = pe_find_node(data_set->nodes, local_uname); if ((node == NULL) || (node->details == NULL)) { set_servant_health(pcmk_health_unknown, LOG_WARNING, "Node state: %s is UNKNOWN", local_uname); notify_parent(); return; } if (node->details->online == FALSE) { set_servant_health(pcmk_health_unknown, LOG_WARNING, "Node state: OFFLINE"); } else if (node->details->unclean) { set_servant_health(pcmk_health_unclean, LOG_WARNING, "Node state: UNCLEAN"); } else if (node->details->pending) { set_servant_health(pcmk_health_pending, LOG_WARNING, "Node state: pending"); } else if (data_set->flags & pe_flag_have_quorum) { set_servant_health(pcmk_health_online, LOG_INFO, "Node state: online"); ever_had_quorum = TRUE; } else if(disk_count > 0) { 
set_servant_health(pcmk_health_noquorum, LOG_WARNING, "Quorum lost"); } else if(ever_had_quorum == FALSE) { set_servant_health(pcmk_health_online, LOG_INFO, "We do not have quorum yet"); } else { /* We lost quorum, and there are no disks present * Setting healthy > 2 here will result in us self-fencing */ switch (data_set->no_quorum_policy) { case no_quorum_freeze: set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Freeze resources"); break; #if HAVE_ENUM_NO_QUORUM_DEMOTE case no_quorum_demote: set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Demote promotable resources and stop others"); break; #endif case no_quorum_stop: set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Stop ALL resources"); break; case no_quorum_ignore: set_servant_health(pcmk_health_transient, LOG_INFO, "Quorum lost: Ignore"); break; default: /* immediate reboot is the most excessive action we take use for no_quorum_suicide and everything we don't know yet */ set_servant_health(pcmk_health_unclean, LOG_INFO, "Quorum lost: Self-fence"); break; } } /* If we are in shutdown-state once this will go on till the end. * If we've on top reached a state of 0 locally running resources * we can assume a clean shutdown. * Tricky are the situations where the node is in maintenance-mode * or resources are unmanaged. So if the node is in maintenance or * all left-over running resources are unmanaged we assume intention. 
*/ if (node->details->shutdown) { pcmk_shutdown = 1; } if (pcmk_shutdown) { pcmk_clean_shutdown = 1; if (!(node->details->maintenance)) { GListPtr iter; for (iter = node->details->running_rsc; iter != NULL; iter = iter->next) { resource_t *rsc = (resource_t *) iter->data; if (is_set(rsc->flags, pe_rsc_managed)) { pcmk_clean_shutdown = 0; crm_debug("not clean as %s managed and still running", rsc->id); break; } } if (pcmk_clean_shutdown) { crm_debug("pcmk_clean_shutdown because " "all managed resources down"); } } else { crm_debug("pcmk_clean_shutdown because node is in maintenance"); } } notify_parent(); return; } static crm_trigger_t *refresh_trigger = NULL; static gboolean mon_trigger_refresh(gpointer user_data) { mainloop_set_trigger(refresh_trigger); mon_refresh_state(NULL); return FALSE; } #define XPATH_SHUTDOWN "//" XML_CIB_TAG_STATE "[@uname='%s']/" \ XML_TAG_TRANSIENT_NODEATTRS "/" XML_TAG_ATTR_SETS "/" \ XML_CIB_TAG_NVPAIR "[@name='" XML_CIB_ATTR_SHUTDOWN "']" static gboolean shutdown_attr_in_cib(void) { xmlNode *match = NULL; char *xpath_string; xpath_string = crm_strdup_printf(XPATH_SHUTDOWN, local_uname); if (xpath_string) { match = get_xpath_object(xpath_string, current_cib, LOG_TRACE); free(xpath_string); } return (match != NULL); } static void crm_diff_update(const char *event, xmlNode * msg) { int rc = -1; const char *op = NULL; long now = time(NULL); static int updates = 0; static mainloop_timer_t *refresh_timer = NULL; if(refresh_timer == NULL) { refresh_timer = mainloop_timer_add("refresh", reconnect_msec, FALSE, mon_trigger_refresh, NULL); refresh_trigger = mainloop_add_trigger(G_PRIORITY_LOW, mon_refresh_state, refresh_timer); } if (current_cib != NULL) { xmlNode *cib_last = current_cib; current_cib = NULL; rc = cib_apply_patch_event(msg, cib_last, ¤t_cib, LOG_DEBUG); free_xml(cib_last); switch(rc) { case -pcmk_err_diff_resync: case -pcmk_err_diff_failed: crm_warn("[%s] %s Patch aborted: %s (%d)", event, op, pcmk_strerror(rc), rc); break; case 
pcmk_ok: updates++; break; default: crm_notice("[%s] %s ABORTED: %s (%d)", event, op, pcmk_strerror(rc), rc); break; } } if (current_cib == NULL) { mon_retrieve_current_cib(); } /* Refresh * - immediately if the last update was more than 1s ago * - every 10 updates * - at most 1s after the last update * - shutdown attribute for our node set for the first time */ if ((!pcmk_shutdown && shutdown_attr_in_cib()) || (updates > 10 || (now - last_refresh) > (reconnect_msec / 1000))) { mon_refresh_state(refresh_timer); updates = 0; } else { mainloop_set_trigger(refresh_trigger); mainloop_timer_start(refresh_timer); } } static gboolean mon_refresh_state(gpointer user_data) { xmlNode *cib_copy = NULL; if(current_cib == NULL) { return FALSE; } if(user_data) { mainloop_timer_t *timer = user_data; mainloop_timer_stop(timer); } cib_copy = copy_xml(current_cib); if (cli_config_update(&cib_copy, NULL, FALSE) == FALSE) { cl_log(LOG_WARNING, "cli_config_update() failed - forcing reconnect to CIB"); if (cib) { cib->cmds->signoff(cib); } } else { last_refresh = time(NULL); data_set->input = cib_copy; data_set->flags |= pe_flag_have_stonith_resource; cluster_status(data_set); compute_status(data_set); pe_reset_working_set(data_set); } return FALSE; } static void clean_up(int rc) { if (timer_id_reconnect > 0) { g_source_remove(timer_id_reconnect); timer_id_reconnect = 0; } if (timer_id_notify > 0) { g_source_remove(timer_id_notify); timer_id_notify = 0; } if (data_set != NULL) { pe_free_working_set(data_set); data_set = NULL; } if (cib != NULL) { cib->cmds->signoff(cib); cib_delete(cib); cib = NULL; } #if USE_PACEMAKERD_API if (pacemakerd_api != NULL) { pcmk_ipc_api_t *capi = pacemakerd_api; pacemakerd_api = NULL; // Ensure we can't free this twice pcmk_free_ipc_api(capi); } #endif if (rc >= 0) { exit(rc); } return; } int servant_pcmk(const char *diskname, int mode, const void* argp) { int exit_code = 0; crm_system_name = strdup("sbd:pcmk"); cl_log(LOG_NOTICE, "Monitoring Pacemaker 
health"); set_proc_title("sbd: watcher: Pacemaker"); setenv("PCMK_watchdog", "true", 1); if(debug == 0) { /* We don't want any noisy crm messages */ set_crm_log_level(LOG_CRIT); } if (data_set == NULL) { data_set = pe_new_working_set(); } if (data_set == NULL) { return -1; } #if USE_PACEMAKERD_API { int rc; rc = pcmk_new_ipc_api(&pacemakerd_api, pcmk_ipc_pacemakerd); if (pacemakerd_api == NULL) { cl_log(LOG_ERR, "Could not connect to pacemakerd: %s\n", pcmk_rc_str(rc)); return -1; } pcmk_register_ipc_callback(pacemakerd_api, pacemakerd_event_cb, NULL); do { rc = pcmk_connect_ipc(pacemakerd_api, pcmk_ipc_dispatch_main); if (rc != pcmk_rc_ok) { cl_log(LOG_DEBUG, "Could not connect to pacemakerd: %s\n", pcmk_rc_str(rc)); sleep(reconnect_msec / 1000); } } while (rc != pcmk_rc_ok); /* send a ping to pacemakerd to wake it up */ pcmk_pacemakerd_api_ping(pacemakerd_api, crm_system_name); /* cib should come up now as well so it's time * to have the inquisitor have a closer look */ notify_parent(); } #endif if (current_cib == NULL) { cib = cib_new(); do { exit_code = cib_connect(TRUE); if (exit_code != 0) { sleep(reconnect_msec / 1000); } } while (exit_code == -ENOTCONN); if (exit_code != 0) { clean_up(-exit_code); } } mainloop = g_main_loop_new(NULL, FALSE); mainloop_add_signal(SIGTERM, mon_shutdown); mainloop_add_signal(SIGINT, mon_shutdown); timer_id_notify = g_timeout_add(timeout_loop * 1000, mon_timer_notify, NULL); g_main_loop_run(mainloop); g_main_loop_unref(mainloop); clean_up(0); return 0; /* never reached */ } sbd-1.5.1/src/sbd.h000066400000000000000000000152071414464774600140240ustar00rootroot00000000000000/* * Copyright (C) 2013 Lars Marowsky-Bree * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. 
* * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* signals reserved for multi-disk sbd */ #define SIG_LIVENESS (SIGRTMIN + 1) /* report liveness of the disk */ #define SIG_EXITREQ (SIGRTMIN + 2) /* exit request to inquisitor */ #define SIG_TEST (SIGRTMIN + 3) /* trigger self test */ #define SIG_RESTART (SIGRTMIN + 4) /* trigger restart of all failed disk */ #define SIG_PCMK_UNHEALTHY (SIGRTMIN + 5) /* FIXME: should add dynamic check of SIG_XX >= SIGRTMAX */ /* exit status for disk-servant */ #define EXIT_MD_SERVANT_IO_FAIL 20 #define EXIT_MD_SERVANT_REQUEST_RESET 21 #define EXIT_MD_SERVANT_REQUEST_SHUTOFF 22 #define EXIT_MD_SERVANT_REQUEST_CRASHDUMP 23 /* exit status for pcmk-servant */ #define EXIT_PCMK_SERVANT_GRACEFUL_SHUTDOWN 30 #define HOG_CHAR 0xff #define SECTOR_NAME_MAX 63 /* Sector data types */ struct sector_header_s { char magic[8]; unsigned char version; unsigned char slots; /* Caveat: stored in network byte-order */ uint32_t sector_size; uint32_t timeout_watchdog; uint32_t timeout_allocate; uint32_t timeout_loop; uint32_t timeout_msgwait; /* Minor version for extensions to the core data set: * compatible and optional values. 
*/ unsigned char minor_version; uuid_t uuid; /* 16 bytes */ }; struct sector_mbox_s { signed char cmd; char from[SECTOR_NAME_MAX+1]; }; struct sector_node_s { /* slots will be created with in_use == 0 */ char in_use; char name[SECTOR_NAME_MAX+1]; }; struct servants_list_item { const char* devname; pid_t pid; int restarts; int restart_blocked; int outdated; int first_start; struct timespec t_last, t_started; struct servants_list_item *next; }; struct sbd_context { int devfd; io_context_t ioctx; struct iocb io; void *buffer; }; enum pcmk_health { pcmk_health_unknown, pcmk_health_pending, pcmk_health_transient, pcmk_health_unclean, pcmk_health_shutdown, pcmk_health_online, pcmk_health_noquorum, }; void usage(void); int watchdog_init_interval(void); int watchdog_tickle(void); int watchdog_init(void); void sysrq_init(void); void watchdog_close(bool disarm); int watchdog_info(void); int watchdog_test(void); void sysrq_trigger(char t); void do_crashdump(void); void do_reset(void); void do_off(void); void do_timeout_action(void); pid_t make_daemon(void); void maximize_priority(void); void sbd_get_uname(void); void sbd_set_format_string(int method, const char *daemon); void notify_parent(void); /* Tunable defaults: */ extern unsigned long timeout_watchdog; extern unsigned long timeout_watchdog_warn; extern bool do_calculate_timeout_watchdog_warn; extern unsigned long timeout_watchdog_crashdump; extern int timeout_allocate; extern int timeout_loop; extern int timeout_msgwait; extern int timeout_io; extern int timeout_startup; extern int watchdog_use; extern int watchdog_set_timeout; extern int skip_rt; extern int debug; extern int debug_mode; extern char *watchdogdev; extern bool watchdogdev_is_default; extern char* local_uname; extern bool do_flush; extern char timeout_sysrq_char; extern bool move_to_root_cgroup; extern bool enforce_moving_to_root_cgroup; extern bool sync_resource_startup; /* Global, non-tunable variables: */ extern int sector_size; extern int watchdogfd; 
extern const char* cmdname; typedef int (*functionp_t)(const char* devname, int mode, const void* argp); int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp); #if SUPPORT_SHARED_DISK void open_any_device(struct servants_list_item *servants); int init_devices(struct servants_list_item *servants); int allocate_slots(const char *name, struct servants_list_item *servants); int list_slots(struct servants_list_item *servants); int ping_via_slots(const char *name, struct servants_list_item *servants); int dump_headers(struct servants_list_item *servants); unsigned long get_first_msgwait(struct servants_list_item *servants); int messenger(const char *name, const char *msg, struct servants_list_item *servants); int servant_md(const char *diskname, int mode, const void* argp); #endif int servant_pcmk(const char *diskname, int mode, const void* argp); int servant_cluster(const char *diskname, int mode, const void* argp); struct servants_list_item *lookup_servant_by_dev(const char *devname); struct servants_list_item *lookup_servant_by_pid(pid_t pid); int init_set_proc_title(int argc, char *argv[], char *envp[]); void set_proc_title(const char *fmt,...); #define cl_log(level, fmt, args...) qb_log_from_external_source( __func__, __FILE__, fmt, level, __LINE__, 0, ##args) # define cl_perror(fmt, args...) do { \ const char *err = strerror(errno); \ cl_log(LOG_ERR, fmt ": %s (%d)", ##args, err, errno); \ } while(0) #define DBGLOG(lvl, fmt, args...) do { \ if (debug > 0) cl_log(lvl, fmt, ##args); \ } while(0) extern int servant_health; void set_servant_health(enum pcmk_health state, int level, char const *format, ...) __attribute__ ((__format__ (__printf__, 3, 4))); bool sbd_is_disk(struct servants_list_item *servant); bool sbd_is_pcmk(struct servants_list_item *servant); bool sbd_is_cluster(struct servants_list_item *servant); #define calculate_timeout_watchdog_warn(timeout) \ (timeout < 5 ? 2 : \ (timeout < (ULONG_MAX / 3) ? 
\ (((unsigned long) timeout) * 3 / 5) : (((unsigned long) timeout) / 5 * 3))) sbd-1.5.1/src/sbd.service.in000066400000000000000000000013721414464774600156400ustar00rootroot00000000000000[Unit] Description=Shared-storage based fencing daemon Documentation=man:sbd(8) Before=pacemaker.service Before=dlm.service After=systemd-modules-load.service iscsi.service PartOf=corosync.service RefuseManualStop=true RefuseManualStart=true [Service] Type=forking PIDFile=@runstatedir@/sbd.pid EnvironmentFile=-@CONFIGDIR@/sbd ExecStart=@sbindir@/sbd $SBD_OPTS -p @runstatedir@/sbd.pid watch ExecStop=@bindir@/kill -TERM $MAINPID # Could this benefit from exit codes for restart? # Does this need to be set to msgwait * 1.2? # TimeoutSec= # If SBD crashes, it'll very likely suicide immediately due to the # hardware watchdog. But one can always try. Restart=on-abort [Install] RequiredBy=corosync.service RequiredBy=pacemaker.service RequiredBy=dlm.service sbd-1.5.1/src/sbd.sh.in000066400000000000000000000072711414464774600146160ustar00rootroot00000000000000#!/bin/bash # # Copyright (C) 2013 Lars Marowsky-Bree # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public # License as published by the Free Software Foundation; either # version 2 of the License, or (at your option) any later version. # # This software is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
# ### BEGIN INIT INFO # Provides: sbd # Required-Start: $network $remote_fs # Should-Start: $syslog iscsi multipath-tools corosync # X-Start-Before: pacemaker # Required-Stop: $network $remote_fs # Should-Stop: iscsi multipath-tools corosync # X-Stop-After: pacemaker # Default-Start: 2 3 4 5 # Default-Stop: 0 1 6 # Short-Description: Shared-storage based fencing daemon # Description: SBD provides a node fencing mechanism for # Pacemaker-based clusters through the exchange of # messages via shared block storage such as for # example a SAN, iSCSI, FCoE. It can be used as # a STONITH mechanism in all configurations that # have reliable shared storage. ### END INIT INFO SBD_CONFIG=@CONFIGDIR@/sbd SBD_BIN="/usr/sbin/sbd" [ -e /lib/lsb/init-functions ] && . /lib/lsb/init-functions test -x $SBD_BIN || exit 1 test -f $SBD_CONFIG || exit 1 . $SBD_CONFIG unset LC_ALL; export LC_ALL unset LANGUAGE; export LANGUAGE : ${OCF_ROOT:=/usr/lib/ocf} : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Construct commandline for some common options if [ -z "$SBD_DEVICE" ]; then echo "No sbd devices defined" exit 1 fi SBD_DEVS=${SBD_DEVICE%;} SBD_DEVICE_ARGS="-d ${SBD_DEVS//;/ -d }" : ${SBD_PIDFILE:=@runstatedir@/sbd.pid} SBD_OPTS+=" -p $SBD_PIDFILE" : ${SBD_PACEMAKER:="true"} if ocf_is_true "$SBD_PACEMAKER" ; then SBD_OPTS+=" -P" fi : ${SBD_WATCHDOG:="true"} if ! ocf_is_true "$SBD_WATCHDOG" ; then SBD_OPTS+=" -W -W" fi if [ -n "$SBD_WATCHDOG_DEV" ]; then SBD_OPTS+=" -w $SBD_WATCHDOG_DEV" fi : ${SBD_STARTMODE:="always"} case "$SBD_STARTMODE" in always) SBD_OPTS+=" -S 0" ;; clean) SBD_OPTS+=" -S 1" ;; esac : ${SBD_DELAY_START:="no"} start() { if ! pidofproc -p $SBD_PIDFILE $SBD_BIN >/dev/null 2>&1 ; then if ! $SBD_BIN $SBD_DEVICE_ARGS $SBD_OPTS watch ; then echo "SBD failed to start; aborting." 
exit 1 fi if ocf_is_true ${SBD_DELAY_START} ; then sleep $($SBD_BIN $SBD_DEVICE_ARGS dump | grep -m 1 msgwait | awk '{print $4}') 2>/dev/null fi else return 0 fi } stop() { if ! $SBD_BIN $SBD_DEVICE_ARGS -D $SBD_OPTS message LOCAL exit ; then echo "SBD failed to stop; aborting." exit 1 fi while pidofproc -p $SBD_PIDFILE $SBD_BIN >/dev/null 2>&1 ; do sleep 1 done } status() { if pidofproc -p $SBD_PIDFILE $SBD_BIN >/dev/null 2>&1 ; then echo "SBD is running." return 0 else echo "SBD is not running." return 1 fi } case "$1" in start|stop|status) $1 ;; restart|force-reload) stop; start ;; *) echo "Usage: $0 (start|stop|status|restart|force-reload)" exit 1 ;; esac # TODO: # - Make openais init script call out to this script too # - How to handle the former "force-start" option? # force-start) # SBD_OPTS="$SBD_OPTS -S 0" # start # ;; sbd-1.5.1/src/sbd.sysconfig.in000066400000000000000000000103461414464774600162050ustar00rootroot00000000000000## Type: string ## Default: "" # # SBD_DEVICE specifies the devices to use for exchanging sbd messages # and to monitor. If specifying more than one path, use ";" as # separator. # #SBD_DEVICE="" ## Type: yesno ## Default: yes # # Whether to enable the pacemaker integration. # SBD_PACEMAKER=yes ## Type: always / clean ## Default: always # # Specify the start mode for sbd. Setting this to "clean" will only # allow sbd to start if it was not previously fenced. See the -S option # in the man page. # SBD_STARTMODE=always ## Type: yesno / integer ## Default: no # # Whether to delay after starting sbd on boot for "msgwait" seconds. # This may be necessary if your cluster nodes reboot so fast that the # other nodes are still waiting in the fence acknowledgement phase. # This is an occasional issue with virtual machines. # # This can also be enabled by being set to a specific delay value, in # seconds. 
Sometimes a longer delay than the default, "msgwait", is # needed, for example in the cases where it's considered to be safer to # wait longer than: # corosync token timeout + consensus timeout + pcmk_delay_max + msgwait # # Be aware that the special value "1" means "yes" rather than "1s". # # Consider that you might have to adapt the startup-timeout accordingly # if the default isn't sufficient. (TimeoutStartSec for systemd) # # This option may be ignored at a later point, once pacemaker handles # this case better. # SBD_DELAY_START=no ## Type: string ## Default: /dev/watchdog # # Watchdog device to use. If set to /dev/null, no watchdog device will # be used. # SBD_WATCHDOG_DEV=/dev/watchdog ## Type: integer ## Default: @SBD_WATCHDOG_TIMEOUT_DEFAULT@ # # How long, in seconds, the watchdog will wait before panicking the # node if no-one tickles it. # # This depends mostly on your storage latency; the majority of devices # must be successfully read within this time, or else the node will # self-fence. # # If your sbd device(s) reside on a multipath setup or iSCSI, this # should be the time required to detect a path failure. # # Be aware that watchdog timeout set in the on-disk metadata takes # precedence. # SBD_WATCHDOG_TIMEOUT=@SBD_WATCHDOG_TIMEOUT_DEFAULT@ ## Type: string ## Default: "flush,reboot" # # Actions to be executed when the watchers don't timely report to the sbd # master process or one of the watchers detects that the master process # has died. # # Set timeout-action to comma-separated combination of # noflush|flush plus reboot|crashdump|off. # If just one of both is given the other stays at the default. # # This doesn't affect actions like off, crashdump, reboot explicitly # triggered via message slots. # And it does as well not configure the action a watchdog would # trigger should it run off (there is no generic interface). 
# SBD_TIMEOUT_ACTION=flush,reboot ## Type: yesno / auto ## Default: auto # # If CPUAccounting is enabled default is not to assign any RT-budget # to the system.slice which prevents sbd from running RR-scheduled. # # One way to escape that issue is to move sbd-processes from the # slice they were originally started to root-slice. # Of course starting sbd in a certain slice might be intentional. # Thus in auto-mode sbd will check if the slice has RT-budget assigned. # If that is the case sbd will stay in that slice while it will # be moved to root-slice otherwise. # SBD_MOVE_TO_ROOT_CGROUP=auto ## Type: yesno ## Default: @SBD_SYNC_RESOURCE_STARTUP_DEFAULT@ # # If resource startup syncing is enabled then pacemakerd is # gonna wait to be pinged via IPC before it starts resources. # On shutdown pacemakerd is going to wait in a state where it # has cleanly shutdown resources till sbd fetches that state. # # The default is set when building SBD and Pacemaker from source. # Going for 'no' is safer if it can't be assured that SBD and # Pacemaker installed do both support the synchronization feature. # When going with 'yes' - also using package dependencies to # assure SBD & Pacemaker both support the synchronization # feature and are assuming the same default - an SBD configuration # inherited via an upgrade doesn't have to be altered to still # benefit from the new feature. 
# SBD_SYNC_RESOURCE_STARTUP=@SBD_SYNC_RESOURCE_STARTUP_SYSCONFIG@ ## Type: string ## Default: "" # # Additional options for starting sbd # SBD_OPTS= sbd-1.5.1/src/sbd_remote.service.in000066400000000000000000000013321414464774600172070ustar00rootroot00000000000000[Unit] Description=Shared-storage based fencing daemon on pacemaker remote node Documentation=man:sbd(8) After=systemd-modules-load.service iscsi.service PartOf=pacemaker_remote.service RefuseManualStop=true RefuseManualStart=true [Service] Type=forking PIDFile=@runstatedir@/sbd.pid EnvironmentFile=-@CONFIGDIR@/sbd ExecStart=@sbindir@/sbd $SBD_OPTS -p @runstatedir@/sbd.pid watch ExecStop=@bindir@/kill -TERM $MAINPID # Could this benefit from exit codes for restart? # Does this need to be set to msgwait * 1.2? # TimeoutSec= # If SBD crashes, it'll very likely suicide immediately due to the # hardware watchdog. But one can always try. Restart=on-abort [Install] RequiredBy=pacemaker_remote.service RequiredBy=dlm.service sbd-1.5.1/src/setproctitle.c000066400000000000000000000140211414464774600157610ustar00rootroot00000000000000/* * setproctitle.c * * The code in this file, setproctitle.c is heavily based on code from * proftpd, please see the licening information below. * * This file added to the heartbeat tree by Horms * * Code to portably change the title of a programme as displayed * by ps(1). * * heartbeat: Linux-HA heartbeat code * * Copyright (C) 1999,2000,2001 Alan Robertson * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ /* * ProFTPD - FTP server daemon * Copyright (c) 1997, 1998 Public Flood Software * Copyright (C) 1999, 2000 MacGyver aka Habeeb J. Dihu * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. * * As a special exemption, Public Flood Software/MacGyver aka Habeeb J. Dihu * and other respective copyright holders give permission to link this program * with OpenSSL, and distribute the resulting executable, without including * the source code for OpenSSL in the source distribution. 
*/ #include #include #include #include #include #define PF_ARGV_NONE 0 #define PF_ARGV_NEW 1 #define PF_ARGV_WRITEABLE 2 #define PF_ARGV_PSTAT 3 #define PF_ARGV_PSSTRINGS 4 #if PF_ARGV_TYPE == PF_ARGV_PSTAT # include #endif #include #if PF_ARGV_TYPE != PF_ARGV_NONE static char **Argv = NULL; static char *LastArgv = NULL; #endif /* PF_ARGV_TYPE != PF_ARGV_NONE */ extern char **environ; #ifdef HAVE___PROGNAME extern char *__progname; extern char *__progname_full; #endif /* HAVE___PROGNAME */ int init_set_proc_title(int argc, char *argv[], char *envp[]) { #if PF_ARGV_TYPE == PF_ARGV_NONE return 0; #else int i; int envpsize; char **p; /* Move the environment so setproctitle can use the space. */ for(i = envpsize = 0; envp[i] != NULL; i++) { envpsize += strlen(envp[i]) + 1; } p = (char **) malloc((i + 1) * sizeof(char *)); if (p == NULL) { return -1; } environ = p; for(i = 0; envp[i] != NULL; i++) { environ[i] = strdup(envp[i]); if(environ[i] == NULL) { goto error_environ; } } environ[i] = NULL; Argv = argv; for(i = 0; i < argc; i++) { if(!i || (LastArgv + 1 == argv[i])) LastArgv = argv[i] + strlen(argv[i]); } for(i = 0; envp[i] != NULL; i++) { if((LastArgv + 1) == envp[i]) { LastArgv = envp[i] + strlen(envp[i]); } } #ifdef HAVE___PROGNAME /* Set the __progname and __progname_full variables so glibc and * company don't go nuts. - MacGyver */ __progname = strdup("sbd"); if (__progname == NULL) { goto error_environ; } __progname_full = strdup(argv[0]); if (__progname_full == NULL) { goto error_environ; } #endif /* HAVE___PROGNAME */ return 0; error_environ: for(i = 0; environ[i] != NULL; i++) { free(environ[i]); } free(environ); return -1; #endif /* PF_ARGV_TYPE == PF_ARGV_NONE */ } void set_proc_title(const char *fmt,...) 
{ #if PF_ARGV_TYPE != PF_ARGV_NONE va_list msg; static char statbuf[BUFSIZ]; #ifndef HAVE_SETPROCTITLE #if PF_ARGV_TYPE == PF_ARGV_PSTAT union pstun pst; #endif /* PF_ARGV_PSTAT */ int i,maxlen = (LastArgv - Argv[0]) - 2; char *p; #endif /* HAVE_SETPROCTITLE */ va_start(msg,fmt); memset(statbuf, 0, sizeof(statbuf)); #ifdef HAVE_SETPROCTITLE # if __FreeBSD__ >= 4 && !defined(FREEBSD4_0) && !defined(FREEBSD4_1) /* FreeBSD's setproctitle() automatically prepends the process name. */ vsnprintf(statbuf, sizeof(statbuf), fmt, msg); # else /* FREEBSD4 */ /* Manually append the process name for non-FreeBSD platforms. */ vsnprintf(statbuf + strlen(statbuf), sizeof(statbuf) - strlen(statbuf), fmt, msg); # endif /* FREEBSD4 */ setproctitle("%s", statbuf); #else /* HAVE_SETPROCTITLE */ /* Manually append the process name for non-setproctitle() platforms. */ vsnprintf(statbuf + strlen(statbuf), sizeof(statbuf) - strlen(statbuf), fmt, msg); #endif /* HAVE_SETPROCTITLE */ va_end(msg); #ifdef HAVE_SETPROCTITLE return; #else i = strlen(statbuf); #if PF_ARGV_TYPE == PF_ARGV_NEW /* We can just replace argv[] arguments. Nice and easy. */ Argv[0] = statbuf; Argv[1] = NULL; #endif /* PF_ARGV_NEW */ #if PF_ARGV_TYPE == PF_ARGV_WRITEABLE /* We can overwrite individual argv[] arguments. Semi-nice. 
*/ snprintf(Argv[0], maxlen, "%s", statbuf); p = &Argv[0][i]; while(p < LastArgv) *p++ = '\0'; Argv[1] = NULL; #endif /* PF_ARGV_WRITEABLE */ #if PF_ARGV_TYPE == PF_ARGV_PSTAT pst.pst_command = statbuf; pstat(PSTAT_SETCMD, pst, i, 0, 0); #endif /* PF_ARGV_PSTAT */ #if PF_ARGV_TYPE == PF_ARGV_PSSTRINGS PS_STRINGS->ps_nargvstr = 1; PS_STRINGS->ps_argvstr = statbuf; #endif /* PF_ARGV_PSSTRINGS */ #endif /* HAVE_SETPROCTITLE */ #endif /* PF_ARGV_TYPE != PF_ARGV_NONE */ } sbd-1.5.1/tests/000077500000000000000000000000001414464774600134515ustar00rootroot00000000000000sbd-1.5.1/tests/Makefile.am000066400000000000000000000002151414464774600155030ustar00rootroot00000000000000lib_LTLIBRARIES = libsbdtestbed.la libsbdtestbed_la_SOURCES = sbd-testbed.c libsbdtestbed_la_LDFLAGS = libsbdtestbed_la_LIBADD = @LIBADD_DL@ sbd-1.5.1/tests/regressions.sh000077500000000000000000000203511414464774600163540ustar00rootroot00000000000000#!/bin/bash # # Copyright (C) 2013 Lars Marowsky-Bree # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public # License as published by the Free Software Foundation; either # version 2 of the License, or (at your option) any later version. # # This software is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # # TODO: # - More tests # - Handle optional, long-running tests better # - Support for explicitly running a single test # - Verify output from commands # - Normalize uuids and device names so they are diffable # - Log to file, instead of syslog is needed # - How to test watch mode? 
# - Can the unit/service file be tested? or at least the wrapper? : ${SBD_BINARY:="/usr/sbin/sbd"} : ${SBD_PRELOAD="libsbdtestbed.so"} : ${SBD_USE_DM:="yes"} sbd() { LD_PRELOAD=${SBD_PRELOAD} SBD_DEVICE="${SBD_DEVICE}" SBD_PRELOAD_LOG=${SBD_PRELOAD_LOG} SBD_WATCHDOG_DEV=/dev/watchdog setsid ${SBD_BINARY} -p ${SBD_PIDFILE} "$@" } sbd_wipe_disk() { dd if=/dev/zero of=$1 count=2048 2>/dev/null } sbd_setup() { trap sbd_teardown EXIT for N in $(seq 3) ; do F[$N]=$(mktemp /tmp/sbd.device.$N.XXXXXX) sbd_wipe_disk ${F[$N]} if [[ "${SBD_USE_DM}" == "yes" ]]; then R[$N]=$(echo ${F[$N]}|cut -f4 -d.) L[$N]=$(losetup -f) losetup ${L[$N]} ${F[$N]} D[$N]="/dev/mapper/sbd_${N}_${R[$N]}" dmsetup create sbd_${N}_${R[$N]} --table "0 2048 linear ${L[$N]} 0" dmsetup mknodes sbd_${N}_${R[$N]} else D[$N]=${F[$N]} fi done if [[ "${SBD_USE_DM}" != "yes" ]]; then SBD_DEVICE="${F[1]};${F[2]};${F[3]}" fi SBD_PIDFILE=$(mktemp /tmp/sbd.pidfile.XXXXXX) SBD_PRELOAD_LOG=$(mktemp /tmp/sbd.logfile.XXXXXX) sbd -d ${D[1]} create WATCHDOG_TIMEOUT=$(LD_PRELOAD=${SBD_PRELOAD} SBD_DEVICE="${D[1]}" ${SBD_BINARY} dump |grep watchdog|cut -f2 -d:) MSGWAIT_TIMEOUT=$(LD_PRELOAD=${SBD_PRELOAD} SBD_DEVICE="${D[1]}" ${SBD_BINARY} dump |grep msgwait|cut -f2 -d:) } sbd_teardown() { for N in $(seq 3) ; do if [[ "${SBD_USE_DM}" == "yes" ]]; then dmsetup remove sbd_${N}_${R[$N]} losetup -d ${L[$N]} fi rm -f ${F[$N]} sbd_daemon_cleanup rm -f ${SBD_PIDFILE} rm -f ${SBD_PRELOAD_LOG} done } sbd_dev_fail() { if [[ "${SBD_USE_DM}" == "yes" ]]; then dmsetup wipe_table sbd_${1}_${R[$1]} else D[$1]=/tmp/fail123456789 fi } sbd_dev_resume() { if [[ "${SBD_USE_DM}" == "yes" ]]; then dmsetup suspend sbd_${1}_${R[$1]} dmsetup load sbd_${1}_${R[$1]} --table "0 2048 linear ${L[$1]} 0" dmsetup resume sbd_${1}_${R[$1]} else D[$1]=${F[$1]} fi } sbd_daemon_cleanup() { echo > ${SBD_PRELOAD_LOG} pkill -TERM --pidfile ${SBD_PIDFILE} 2>/dev/null sleep 5 pkill -KILL --pidfile ${SBD_PIDFILE} 2>/dev/null pkill -KILL --parent "$(cat 
${SBD_PIDFILE} 2>/dev/null)" 2>/dev/null echo > ${SBD_PIDFILE} } _ok() { echo "-- $*" "$@" rc=$? if [ $rc -ne 0 ]; then echo "$* failed with $rc" exit $rc fi } _no() { echo "-- $*" "$@" rc=$? if [ $rc -eq 0 ]; then echo "$* did NOT fail ($rc)" exit $rc fi return 0 } _in_log() { grep "$@" ${SBD_PRELOAD_LOG} >/dev/null if [ $? -ne 0 ]; then echo "didn't find '$*' in log:" cat ${SBD_PRELOAD_LOG} sbd_daemon_cleanup exit 1 fi } test_1() { echo "Creating three devices" _ok sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} create _ok sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} dump } test_2() { echo "Basic functionality" for S in `seq 2` ; do _ok sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} allocate "test-$S" done _ok sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} -n test-1 message test-2 reset _ok sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} list } test_3() { echo "Start mode (expected not to start, because reset was written in test_2)" _no sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} -n test-2 -Z -Z -Z -S 1 watch } test_4() { echo "Deliver message with 1 failure" sbd_dev_fail 1 _no sbd -d ${D[1]} -n test-1 message test-2 exit _no sbd -d ${D[1]} -d ${D[2]} -n test-1 message test-2 exit _ok sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} -n test-1 message test-2 exit sbd_dev_resume 1 } test_5() { echo "Deliver message with 2 failures" sbd_dev_fail 1 sbd_dev_fail 2 _no sbd -d ${D[1]} -d ${D[2]} -n test-1 message test-2 exit _no sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} -n test-1 message test-2 exit sbd_dev_resume 1 sbd_dev_resume 2 } test_6() { echo "Deliver message with 3 failures" sbd_dev_fail 1 sbd_dev_fail 2 sbd_dev_fail 3 _no sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} -n test-1 message test-2 exit sbd_dev_resume 1 sbd_dev_resume 2 sbd_dev_resume 3 } test_101() { echo "Creating one device" _ok sbd -d ${D[1]} create } test_102() { echo "Creating two devices" _ok sbd -d ${D[1]} -d ${D[2]} create } test_7() { echo "Allocate all slots plus 1" _ok sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} -2 0 create for S in `seq 255` ; do _ok sbd -d ${D[1]} -d ${D[2]} 
-d ${D[3]} allocate "test-$S" done _no sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} allocate "test-256" } test_8() { echo "Non-existent device path" _no sbd -d /dev/kfdifdifdfdlfd -create 2>/dev/null } test_9() { echo "Basic sbd invocation" _no sbd _ok sbd -h } test_watchdog() { echo "Basic watchdog test" echo > ${SBD_PRELOAD_LOG} sbd test-watchdog < /dev/null _in_log "watchdog fired" } test_stall_inquisitor() { echo "Stall inquisitor test" sbd_daemon_cleanup sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} -n test-1 watch sleep 10 _ok kill -0 "$(cat ${SBD_PIDFILE})" kill -STOP "$(cat ${SBD_PIDFILE})" sleep $((${WATCHDOG_TIMEOUT} * 2)) kill -CONT "$(cat ${SBD_PIDFILE})" 2>/dev/null _in_log "watchdog fired" } test_wipe_slots1() { echo "Wipe slots test (with watchdog)" sbd_daemon_cleanup sbd -d ${D[1]} -n test-1 watch sleep 2 sbd_wipe_disk ${D[1]} sleep $((${MSGWAIT_TIMEOUT} + ${WATCHDOG_TIMEOUT} * 2)) _in_log "watchdog fired" } test_wipe_slots2() { echo "Wipe slots test (without watchdog)" sbd_daemon_cleanup sbd -d ${D[1]} create sbd -d ${D[1]} -w /dev/null -n test-1 watch sleep 2 sbd_wipe_disk ${D[1]} sleep $((${MSGWAIT_TIMEOUT} + ${WATCHDOG_TIMEOUT} * 2)) _in_log "sysrq-trigger ('b')" _in_log "reboot (reboot)" } test_message1() { echo "Message test (reset)" sbd_daemon_cleanup sbd -d ${D[1]} create sbd -d ${D[1]} -w /dev/null -n test-1 watch sleep 2 sbd -d ${D[1]} message test-1 reset sleep 2 _in_log "sysrq-trigger ('b')" _in_log "reboot (reboot)" } test_message2() { echo "Message test (off)" sbd_daemon_cleanup sbd -d ${D[1]} create sbd -d ${D[1]} -w /dev/null -n test-1 watch sleep 2 sbd -d ${D[1]} message test-1 off sleep 2 _in_log "sysrq-trigger ('o')" _in_log "reboot (poweroff)" } test_message3() { echo "Message test (crashdump)" sbd_daemon_cleanup sbd -d ${D[1]} create sbd -d ${D[1]} -w /dev/null -n test-1 watch sleep 2 sbd -d ${D[1]} message test-1 crashdump sleep 2 _in_log "sysrq-trigger ('c')" } test_timeout_action1() { echo "Timeout action test (off)" sbd_daemon_cleanup sbd -d 
${D[1]} create SBD_TIMEOUT_ACTION=off sbd -d ${D[1]} -w /dev/null -n test-1 watch sleep 2 sbd_wipe_disk ${D[1]} sleep $((${MSGWAIT_TIMEOUT} + ${WATCHDOG_TIMEOUT} * 2)) _in_log "sysrq-trigger ('o')" _in_log "reboot (poweroff)" } test_timeout_action2() { echo "Timeout action test (crashdump)" sbd_daemon_cleanup sbd -d ${D[1]} create SBD_TIMEOUT_ACTION=crashdump sbd -d ${D[1]} -w /dev/null -n test-1 watch sleep 2 sbd_wipe_disk ${D[1]} sleep $((${MSGWAIT_TIMEOUT} + ${WATCHDOG_TIMEOUT} * 2)) _in_log "sysrq-trigger ('c')" } sbd_setup _ok test "${WATCHDOG_TIMEOUT}" -eq "${WATCHDOG_TIMEOUT}" _ok test "${MSGWAIT_TIMEOUT}" -eq "${MSGWAIT_TIMEOUT}" echo "running sbd-tests with WATCHDOG_TIMEOUT=${WATCHDOG_TIMEOUT}s MSGWAIT_TIMEOUT=${MSGWAIT_TIMEOUT}s" if [[ "${SBD_PRELOAD}" != "" ]]; then SBD_DAEMON_TESTS="watchdog stall_inquisitor wipe_slots1 wipe_slots2 message1 message2 message3 timeout_action1 timeout_action2" fi for T in 101 102 $(seq 9) ${SBD_DAEMON_TESTS}; do if ! test_$T ; then echo "FAILURE: Test $T" break fi echo "SUCCESS: Test $T" done echo "SUCCESS: All tests completed" sbd-1.5.1/tests/sbd-testbed.c000066400000000000000000000464771414464774600160370ustar00rootroot00000000000000#define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if __GLIBC_PREREQ(2,36) #include #else #include typedef gboolean (*GUnixFDSourceFunc) (gint fd, GIOCondition condition, gpointer user_data); static gboolean GIOFunc2GUnixFDSourceFunc(GIOChannel *source, GIOCondition condition, gpointer data) { return ((GUnixFDSourceFunc) data) ( g_io_channel_unix_get_fd(source), condition, NULL); } static guint g_unix_fd_add(gint fd, GIOCondition condition, GUnixFDSourceFunc function, gpointer user_data) { GIOChannel *chan = g_io_channel_unix_new (fd); if (chan == NULL) { return 0; } else { return g_io_add_watch(chan, condition, GIOFunc2GUnixFDSourceFunc, (gpointer) function); } 
} #endif typedef int (*orig_open_f_type)(const char *pathname, int flags, ...); typedef int (*orig_ioctl_f_type)(int fd, unsigned long int request, ...); typedef ssize_t (*orig_write_f_type)(int fd, const void *buf, size_t count); typedef int (*orig_close_f_type)(int fd); typedef FILE *(*orig_fopen_f_type)(const char *pathname, const char *mode); typedef int (*orig_fclose_f_type)(FILE *fp); typedef int (*orig_io_setup_f_type)(int nr_events, io_context_t *ctx_idp); typedef int (*orig_io_destroy_f_type)(io_context_t ctx_id); typedef int (*orig_io_submit_f_type)(io_context_t ctx_id, long nr, struct iocb *ios[]); typedef int (*orig_io_getevents_f_type)(io_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout); typedef int (*orig_io_cancel_f_type)(io_context_t ctx_id, struct iocb *iocb, struct io_event *result); static int is_init = 0; static FILE *log_fp = NULL; static char *sbd_device[3] = {NULL, NULL, NULL}; static int sbd_device_fd[3] = {-1, -1, -1}; static FILE *sysrq_fp = NULL; static FILE *sysrq_trigger_fp = NULL; static char *watchdog_device = NULL; static int watchdog_device_fd = -1; static int watchdog_timeout = -1; static pid_t watchdog_pid = -1; static int watchdog_pipe[2] = {-1, -1}; static guint watchdog_source_id = 0; static int watchdog_timer_id = 0; static orig_open_f_type orig_open = NULL; static orig_ioctl_f_type orig_ioctl = NULL; static orig_write_f_type orig_write = NULL; static orig_close_f_type orig_close = NULL; static orig_fopen_f_type orig_fopen = NULL; static orig_fclose_f_type orig_fclose = NULL; static orig_io_setup_f_type orig_io_setup = NULL; static orig_io_destroy_f_type orig_io_destroy = NULL; static orig_io_submit_f_type orig_io_submit = NULL; static orig_io_getevents_f_type orig_io_getevents = NULL; static orig_io_cancel_f_type orig_io_cancel = NULL; /* fprintf is inlined as __fprintf_chk or * we have vfprintf. * For fscanf we have vfscanf. 
* For reboot we anyway don't want that to be * called in any case. */ static struct iocb *pending_iocb = NULL; struct io_context { int context_num; }; static struct io_context our_io_context = {.context_num = 1}; static int translate_aio = 0; static GMainLoop *mainloop = NULL; #if 0 static void watchdog_shutdown(int nsig) { if (watchdog_timer_id > 0) { fprintf(log_fp, "exiting with watchdog-timer armed\n"); } } #endif static void* dlsym_fatal(void *handle, const char *symbol) { void *rv = dlsym(handle, symbol); if (!rv) { fprintf(stderr, "Failed looking up symbol %s\n", symbol); exit(1); } return rv; } static void init (void) { void *handle; if (!is_init) { const char *value; int i; char *token, *str, *str_orig; is_init = 1; orig_open = (orig_open_f_type)dlsym_fatal(RTLD_NEXT,"open"); orig_ioctl = (orig_ioctl_f_type)dlsym_fatal(RTLD_NEXT,"ioctl"); orig_close = (orig_close_f_type)dlsym_fatal(RTLD_NEXT,"close"); orig_write = (orig_write_f_type)dlsym_fatal(RTLD_NEXT,"write"); orig_fopen = (orig_fopen_f_type)dlsym_fatal(RTLD_NEXT,"fopen"); orig_fclose = (orig_fclose_f_type)dlsym_fatal(RTLD_NEXT,"fclose"); handle = dlopen("libaio.so.1", RTLD_NOW); if (!handle) { fprintf(stderr, "Failed opening libaio.so.1\n"); exit(1); } orig_io_setup = (orig_io_setup_f_type)dlsym_fatal(handle,"io_setup"); orig_io_destroy = (orig_io_destroy_f_type)dlsym_fatal(handle,"io_destroy"); orig_io_submit = (orig_io_submit_f_type)dlsym_fatal(handle,"io_submit"); orig_io_getevents = (orig_io_getevents_f_type)dlsym_fatal(handle,"io_getevents"); orig_io_cancel = (orig_io_cancel_f_type)dlsym_fatal(handle,"io_cancel"); dlclose(handle); value = getenv("SBD_PRELOAD_LOG"); if (value) { log_fp = fopen(value, "a"); } else { int fd = dup(fileno(stderr)); if (fd >= 0) { log_fp = fdopen(fd, "w"); } } if (log_fp == NULL) { fprintf(stderr, "couldn't open log-file\n"); } value = getenv("SBD_WATCHDOG_DEV"); if (value) { watchdog_device = strdup(value); } value = getenv("SBD_DEVICE"); if ((value) && (str = 
str_orig = strdup(value))) { for (i = 0; i < 3; i++, str = NULL) { token = strtok(str, ";"); if (token == NULL) { break; } sbd_device[i] = strdup(token); } free(str_orig); } value = getenv("SBD_TRANSLATE_AIO"); if ((value) && !strcmp(value, "yes")) { translate_aio = 1; } } } // ***** end - handling of watchdog & block-devices **** static gboolean watchdog_timeout_notify(gpointer data) { fprintf(log_fp, "watchdog fired after %ds - killing process group\n", watchdog_timeout); fclose(log_fp); log_fp = NULL; killpg(0, SIGKILL); exit(1); } static gboolean watchdog_dispatch_callback (gint fd, GIOCondition condition, gpointer user_data) { char buf[256]; int i = 0; if (condition & G_IO_HUP) { return FALSE; } if (watchdog_timer_id > 0) { g_source_remove(watchdog_timer_id); } watchdog_timer_id = 0; for (i = 0; i < sizeof(buf)-1; i++) { ssize_t len; do { len = read(watchdog_pipe[0], &buf[i], 1); } while ((len == -1) && (errno == EINTR)); if (len <= 0) { if (len == -1) { fprintf(log_fp, "Couldn't read from watchdog-pipe\n"); } buf[i] = '\0'; break; } if (buf[i] == '\n') { buf[i] = '\0'; break; } } buf[sizeof(buf)-1] = '\0'; if (sscanf(buf, "trigger %ds", &watchdog_timeout) == 1) { watchdog_timer_id = g_timeout_add(watchdog_timeout * 1000, watchdog_timeout_notify, NULL); } else if (strcmp(buf, "disarm") == 0) { // timer is stopped already } else { fprintf(log_fp, "unknown watchdog command\n"); } return TRUE; } static void watchdog_arm (void) { char buf[256]; if ((watchdog_timeout > 0) && (watchdog_pipe[1] >= 0)) { sprintf(buf, "trigger %ds\n", watchdog_timeout); if (write(watchdog_pipe[1], buf, strlen(buf)) != strlen(buf)) { fprintf(log_fp, "Failed tickling watchdog via pipe\n"); } } } static void watchdog_disarm (void) { char buf[256]; watchdog_timeout = -1; if (watchdog_pipe[1] >= 0) { sprintf(buf, "disarm\n"); if (write(watchdog_pipe[1], buf, strlen(buf)) != strlen(buf)) { fprintf(log_fp, "Failed disarming watchdog via pipe\n"); } } } int open(const char *pathname, int 
flags, ...) { int i, fd; int devnum = -1; int is_wd_dev = 0; va_list ap; init(); for (i=0; i < 3; i++) { if (sbd_device[i]) { if (strcmp(sbd_device[i], pathname) == 0) { devnum = i; flags &= ~O_DIRECT; break; } } } if (watchdog_device) { if (strcmp(watchdog_device, pathname) == 0) { is_wd_dev = 1; if (watchdog_pipe[1] == -1) { if (pipe(watchdog_pipe) == -1) { fprintf(log_fp, "Creating pipe for watchdog failed\n"); } else { int i; watchdog_pid = fork(); switch (watchdog_pid) { case -1: fprintf(log_fp, "Forking watchdog-child failed\n"); break; case 0: free(watchdog_device); watchdog_device = NULL; for (i = 0; i < 3; i++) { free(sbd_device[i]); sbd_device[i] = NULL; } close(watchdog_pipe[1]); if (fcntl(watchdog_pipe[0], F_SETFL, O_NONBLOCK) == -1) { // don't block on read for timer to be handled fprintf(log_fp, "Failed setting watchdog-pipe-read to non-blocking"); } mainloop = g_main_loop_new(NULL, FALSE); // mainloop_add_signal(SIGTERM, watchdog_shutdown); // mainloop_add_signal(SIGINT, watchdog_shutdown); watchdog_source_id = g_unix_fd_add(watchdog_pipe[0], G_IO_IN, watchdog_dispatch_callback, NULL); if (watchdog_source_id == 0) { fprintf(log_fp, "Failed creating source for watchdog-pipe\n"); exit(1); } g_main_loop_run(mainloop); g_main_loop_unref(mainloop); exit(0); default: close(watchdog_pipe[0]); if (fcntl(watchdog_pipe[1], F_SETFL, O_NONBLOCK) == -1) { fprintf(log_fp, "Failed setting watchdog-pipe-write to non-blocking"); } } } } pathname = "/dev/null"; } } va_start (ap, flags); fd = (flags & (O_CREAT #ifdef O_TMPFILE | O_TMPFILE #endif ))? 
orig_open(pathname, flags, va_arg(ap, mode_t)): orig_open(pathname, flags); va_end (ap); if (devnum >= 0) { sbd_device_fd[devnum] = fd; } else if (is_wd_dev) { watchdog_device_fd = fd; } return fd; } ssize_t write(int fd, const void *buf, size_t count) { init(); if ((fd == watchdog_device_fd) && (count >= 1)) { if (*(const char *)buf == 'V') { watchdog_disarm(); } else { watchdog_arm(); } } return orig_write(fd, buf, count); } int ioctl(int fd, unsigned long int request, ...) { int rv = -1; va_list ap; int i; init(); va_start(ap, request); switch (request) { case BLKSSZGET: for (i=0; i < 3; i++) { if (sbd_device_fd[i] == fd) { rv = 0; *(va_arg(ap, int *)) = 512; break; } if (i == 2) { rv = orig_ioctl(fd, request, va_arg(ap, int *)); } } break; case WDIOC_SETTIMEOUT: if (fd == watchdog_device_fd) { watchdog_timeout = *va_arg(ap, int *); watchdog_arm(); rv = 0; break; } rv = orig_ioctl(fd, request, va_arg(ap, int *)); break; case WDIOC_SETOPTIONS: if (fd == watchdog_device_fd) { int flags = *va_arg(ap, int *); if (flags & WDIOS_DISABLECARD) { watchdog_disarm(); } rv = 0; break; } rv = orig_ioctl(fd, request, va_arg(ap, int *)); break; case WDIOC_GETSUPPORT: rv = orig_ioctl(fd, request, va_arg(ap, struct watchdog_info *)); break; default: fprintf(log_fp, "ioctl using unknown request = 0x%08lx", request); rv = orig_ioctl(fd, request, va_arg(ap, void *)); } va_end(ap); return rv; } int close(int fd) { int i; init(); if (fd == watchdog_device_fd) { watchdog_device_fd = -1; } else { for (i = 0; i < 3; i++) { if (sbd_device_fd[i] == fd) { sbd_device_fd[i] = -1; break; } } } return orig_close(fd); } // ***** end - handling of watchdog & block-devices **** // ***** handling of sysrq, sysrq-trigger & reboot **** FILE * fopen(const char *pathname, const char *mode) { int is_sysrq = 0; int is_sysrq_trigger = 0; FILE *fp; init(); if ((strcmp("/proc/sys/kernel/sysrq", pathname) == 0) && strcmp("w", mode)) { pathname = "/dev/null"; is_sysrq = 1; } else if 
(strcmp("/proc/sysrq-trigger", pathname) == 0) { pathname = "/dev/null"; is_sysrq_trigger = 1; } fp = orig_fopen(pathname, mode); if (is_sysrq) { sysrq_fp = fp; } else if (is_sysrq_trigger) { sysrq_trigger_fp = fp; } return fp; } int fclose(FILE *fp) { init(); if (fp == sysrq_fp) { sysrq_fp = NULL; } else if (fp == sysrq_trigger_fp) { sysrq_trigger_fp = NULL; } return orig_fclose(fp); } #if defined(__USE_FORTIFY_LEVEL) && (__USE_FORTIFY_LEVEL > 1) int __fprintf_chk(FILE *stream, int flag, const char *format, ...) #else int fprintf(FILE *stream, const char *format, ...) #endif { va_list ap; int rv; init(); va_start (ap, format); if (stream == sysrq_trigger_fp) { char buf[256]; rv = vsnprintf(buf, sizeof(buf), format, ap); if (rv >= 1) { fprintf(log_fp, "sysrq-trigger ('%c') - %s\n", buf[0], (buf[0] == 'c')?"killing process group":"don't kill but wait for reboot-call"); if (buf[0] == 'c') { fclose(log_fp); log_fp = NULL; killpg(0, SIGKILL); exit(1); } } } else { rv = vfprintf(stream, format, ap); } va_end (ap); return rv; } int fscanf(FILE *stream, const char *format, ...) 
{ va_list ap; int rv; init(); va_start (ap, format); rv = vfscanf(stream, format, ap); va_end (ap); return rv; } int reboot (int __howto) { fprintf(log_fp, "reboot (%s) - exiting inquisitor process\n", (__howto == RB_POWER_OFF)?"poweroff":"reboot"); fclose(log_fp); log_fp = NULL; killpg(0, SIGKILL); exit(1); } // ***** end - handling of sysrq, sysrq-trigger & reboot **** // ***** aio translate **** #if 0 struct iocb { void *data; unsigned key; short aio_lio_opcode; short aio_reqprio; int aio_fildes; }; static inline void io_prep_pread(struct iocb *iocb, int fd, void *buf, size_t count, long long offset) { memset(iocb, 0, sizeof(*iocb)); iocb->aio_fildes = fd; iocb->aio_lio_opcode = IO_CMD_PREAD; iocb->aio_reqprio = 0; iocb->u.c.buf = buf; iocb->u.c.nbytes = count; iocb->u.c.offset = offset; } static inline void io_prep_pwrite(struct iocb *iocb, int fd, void *buf, size_t count, long long offset) { memset(iocb, 0, sizeof(*iocb)); iocb->aio_fildes = fd; iocb->aio_lio_opcode = IO_CMD_PWRITE; iocb->aio_reqprio = 0; iocb->u.c.buf = buf; iocb->u.c.nbytes = count; iocb->u.c.offset = offset; } #endif int io_setup(int nr_events, io_context_t *ctx_idp) { init(); if (!translate_aio) { return orig_io_setup(nr_events, ctx_idp); } if (nr_events == 0) { return EINVAL; } if (nr_events > 1) { return EAGAIN; } if (ctx_idp == NULL) { return EFAULT; } *ctx_idp = &our_io_context; return 0; } int io_destroy(io_context_t ctx_id) { init(); if (!translate_aio) { return orig_io_destroy(ctx_id); } if (ctx_id != &our_io_context) { return EINVAL; } return 0; } int io_submit(io_context_t ctx_id, long nr, struct iocb *ios[]) { init(); if (!translate_aio) { return orig_io_submit(ctx_id, nr, ios); } if ((pending_iocb != NULL) || (nr > 1)) { return EAGAIN; } if ((nr == 1) && ((ios == NULL) || (ios[0] == NULL))) { return EFAULT; } if ((ctx_id != &our_io_context) || (nr < 0) || ((nr == 1) && (ios[0]->aio_lio_opcode != IO_CMD_PREAD) && (ios[0]->aio_lio_opcode != IO_CMD_PWRITE))) { return EINVAL; } if 
((fcntl(ios[0]->aio_fildes, F_GETFD) == -1) && (errno == EBADF)) { return EBADF; } if (nr == 1) { pending_iocb = ios[0]; } return nr; } int io_getevents(io_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout) { init(); if (!translate_aio) { return orig_io_getevents(ctx_id, min_nr, nr, events, timeout); } if ((ctx_id != &our_io_context) || (min_nr != 1) || (nr != 1)) { return EINVAL; } if (pending_iocb == NULL) { return 0; } switch (pending_iocb->aio_lio_opcode) { case IO_CMD_PWRITE: events->res = pwrite(pending_iocb->aio_fildes, pending_iocb->u.c.buf, pending_iocb->u.c.nbytes, pending_iocb->u.c.offset); break; case IO_CMD_PREAD: events->res = pread(pending_iocb->aio_fildes, pending_iocb->u.c.buf, pending_iocb->u.c.nbytes, pending_iocb->u.c.offset); break; default: events->res = 0; } events->data = pending_iocb->data; events->obj = pending_iocb; events->res2 = 0; pending_iocb = NULL; return 1; } int io_cancel(io_context_t ctx_id, struct iocb *iocb, struct io_event *result) { init(); if (!translate_aio) { return orig_io_cancel(ctx_id, iocb, result); } if (ctx_id != &our_io_context) { return EINVAL; } if ((iocb == NULL) || (result == NULL)) { return EFAULT; } if (pending_iocb != iocb) { return EAGAIN; } result->data = iocb->data; result->obj = iocb; result->res = 0; result->res2 = 0; pending_iocb = NULL; return 0; } // ***** end - aio translate ****