pax_global_header00006660000000000000000000000064122363077660014526gustar00rootroot0000000000000052 comment=508c28708f0a4ddd870306ff6810d57e9bf2ccd9 sheepdog-0.7.5/000077500000000000000000000000001223630776600133355ustar00rootroot00000000000000sheepdog-0.7.5/.gitignore000066400000000000000000000013001223630776600153170ustar00rootroot00000000000000# # Normal rules # .* *.o *.o.* *.a *.s *.ko *.so *.mod.c *.i *.lst *.symtypes *.d *.orig *.rej cscope.* *.gcda *.gcno *.info # # for GLOBAL # GTAGS GRTAGS GPATH GSYMS # # programs # dog/dog sheep/sheep sheepfs/sheepfs shepherd/shepherd tools/zk_control tests/unit/dog/test_common tests/unit/sheep/test_vdi tests/unit/sheep/test_cluster_driver # directories .deps autom4te.cache coverage aclocal.m4 Makefile Makefile.in INSTALL config.log config.status config.guess config.sub configure depcomp install-sh missing push stamp-h1 libtool ltmain.sh config.h config.h.in script/generic tests/check.log tests/check.time tests/atconfig tests/*.out.bad *.patch man/sheep.8 man/dog.8 man/sheepfs.8 *.deb sheepdog-0.7.5/COPYING000066400000000000000000000431101223630776600143670ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Lesser General Public License instead.) You can apply it to your programs, too. 
When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. 
GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. 
You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. 
Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. 
However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. 
If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. 
If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. , 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. 
sheepdog-0.7.5/INSTALL000066400000000000000000000045241223630776600143730ustar00rootroot00000000000000=========================== Sheepdog Installation Guide =========================== Run-time dependencies --------------------- * Three or more x86-64 machines. * The corosync and corosync lib package or zookeeper equivalent * QEMU 0.13 or later * liburcu Compile-time dependencies ------------------------- * GNU Autotools * corosync devel package * liburcu devel package * git * optional:fuse-devel (for sheepfs) * optional:libzookeeper-mt-dev (for zookeeper support) Installing from source ------------------------------------ 1. Compile or install the Corosync packages: Nearly every modern Linux distribution has x86_64 corosync binaries pre-built available via their repositories. We recommend you use these packages if they are available on your distribution. For debian package based systems: $ sudo aptitude install corosync libcorosync-dev For RPM package based systems: $ sudo yum install corosynclib-devel For EL6 (RHEL, CentOS, SL, etc), the provided version of corosync is too old and you must install corosync from source. 2. Download, build and install QEMU with Sheepdog support: QEMU 0.13 or later provides built-in support for sheepdog devices. Some distributions provide pre-built versions of this newer version of QEMU. If your distribution has an older version of QEMU or you prefer to compile from source, retrieve the latest QEMU and compile: $ git clone git://git.qemu.org/qemu.git $ cd qemu $ ./configure $ sudo make install 3. Download, build and install the Sheepdog server and command line tools: $ git clone git://github.com/sheepdog/sheepdog.git $ cd sheepdog $ ./autogen.sh $ ./configure $ sudo make install If your want to built-in sheepfs and zookeeper support, try: $ ./configure --enable-zookeeper --enable-sheepfs Please note, sheepdog supports a "make rpm" target which will generate an rpm package that can be installed on the local machine. 
To use this installation method, use the following instructions: At sheepdog source directory $ make rpm $ sudo rpm -ivh x86_64/sheepdog-0.* Please read the README file, the sheep(8), dog(8) or sheepfs(8) man page for further usage instructions. =============================================================================== Copyright (C) 2009-2011, Nippon Telegraph and Telephone Corporation. sheepdog-0.7.5/Makefile.am000066400000000000000000000051721223630776600153760ustar00rootroot00000000000000SPEC = $(PACKAGE_NAME).spec TARFILE = $(PACKAGE_NAME)-$(VERSION).tar.gz EXTRA_DIST = autogen.sh AUTOMAKE_OPTIONS = foreign MAINTAINERCLEANFILES = Makefile.in aclocal.m4 configure depcomp \ config.guess config.sub missing install-sh \ autoheader automake autoconf config.status \ config.log dist_doc_DATA = sheepdogsysconfdir = ${SHEEPDOGCONFDIR} sheepdogsysconf_DATA = SUBDIRS = lib dog sheep include script shepherd tools if BUILD_SHEEPFS SUBDIRS += sheepfs endif SUBDIRS += man if BUILD_UNITTEST SUBDIRS += tests/unit endif install-exec-local: $(INSTALL) -d $(DESTDIR)/${localstatedir}/lib/sheepdog uninstall-local: rmdir $(DESTDIR)/${localstatedir}/lib/sheepdog || :; dist-clean-local: rm -f autoconf automake autoheader clean-generic: rm -rf $(SPEC) $(TARFILE) cscope* find -name '*.orig' -or -name '*.rej' | xargs rm -f find -name '*.gcno' -or -name '*.gcda' -or -name '*.info' | xargs rm -f cscope: @echo create cscope.out @find -name '*.[chS]' > cscope.files @cscope -bq $(SPEC): $(SPEC).in rm -f $@-t $@ LC_ALL=C date="$(shell date "+%a %b %d %Y")" && \ sed \ -e "s#@version@#$(VERSION)#g" \ -e "s#@date@#$$date#g" \ $< > $@-t chmod a-w $@-t mv $@-t $@ RPMBUILDOPTS = --define "_sourcedir $(abs_builddir)" \ --define "_specdir $(abs_builddir)" \ --define "_builddir $(abs_builddir)" \ --define "_srcrpmdir $(abs_builddir)" \ --define "_rpmdir $(abs_builddir)" $(TARFILE): $(MAKE) dist srpm: clean $(MAKE) $(SPEC) $(TARFILE) rpmbuild $(RPMBUILDOPTS) --nodeps -bs $(SPEC) rpm: clean $(MAKE) 
$(SPEC) $(TARFILE) rpmbuild $(RPMBUILDOPTS) -ba $(SPEC) deb: fakeroot ./debian/rules clean git log > debian/CHANGELOG rm -f debian/changelog dch -v $(shell echo $(PACKAGE_VERSION) | sed s/_/+/ | sed s/_/./g)-1 \ --package sheepdog --create 'Local build' fakeroot ./debian/rules binary CGCC=cgcc CGCC_CFLAGS=-Wbitwise -Wno-return-void $(ARCH) -fno-common sparse: ARCH=$(shell sh script/checkarch.sh) sparse: $(MAKE) CC=$(CGCC) CFLAGS="$(CFLAGS) $(CGCC_CFLAGS)" CHECK_STYLE=../script/checkpatch.pl -f --no-summary --terse check-style: @for dir in lib dog sheep include sheepfs; do \ make -C $$dir check-style CHECK_STYLE="$(CHECK_STYLE)"; \ done if BUILD_COVERAGE coverage: clean check @rm -rf coverage @for dir in dog sheep tests/unit/dog tests/unit/sheep ; do\ $(MAKE) -C $$dir coverage; \ done @lcov -a dog/dog.info -a sheep/sheep.info \ -a tests/unit/dog/dog.info -a tests/unit/sheep/sheep.info \ -o sheep.info && \ lcov -r sheep.info /usr/include/\* -o sheep.info && \ lcov -r sheep.info tests/unit/\* -o sheep.info && \ genhtml sheep.info -o coverage endif sheepdog-0.7.5/README000066400000000000000000000157271223630776600142310ustar00rootroot00000000000000Sheepdog: Distributed Storage System for KVM ============================================ Overview -------- Sheepdog is a distributed storage system for QEMU. It provides highly available block level storage volumes to virtual machines. Sheepdog supports advanced volume management features such as snapshot, cloning, and thin provisioning. Sheepdog is an Open Source software, released under the terms of the GNU General Public License version 2. 
For the latest information about Sheepdog, please visit our website at: http://www.osrg.net/sheepdog/ And (recommended for new comers) wiki at: https://github.com/sheepdog/sheepdog/wiki/ Requirements ------------ * Three or more x86-64 machines * Corosync cluster engine Install ------- Please read the INSTALL file distributed with this package for detailed instructions on installing or compiling from source. Usage ----- * Cluster Management Backends Sheepdog uses a cluster management backend to manage membership and broadcast messages to the cluster nodes. For now, sheepdog can use local driver (for development on a single box), corosync (the default), zookeeper and Accord. * Local Driver This driver just makes use of UNIX IPC mechanism to manage the membership on a single box, where we start multiple 'sheep' processes to simulate the cluster. It is very easy and fast setup and especially useful to test functionality without involving any other software. To set up a 3 node cluster using local driver in one liner bash with debug mode: $ mkdir /path/to/store $ for i in 0 1 2; do sheep -c local -d /path/to/store/$i -z $i -p 700$i;sleep 1;done * Configure corosync. Nearly every modern Linux distribution has x86_64 corosync binaries pre-built available via their repositories. We recommend you use these packages if they are available on your distribution. For debian package based systems: $ sudo aptitude install corosync libcorosync-dev For RPM package based systems: $ sudo yum install corosynclib-devel Reference our wiki, the corosync(8) and corosync.conf(5) man page for further details. * Setup Sheepdog 1. Launch sheepdog on each machines of the cluster. $ sheep /store_dir Notes: /store_dir is a directory to store objects. The directory must be on the filesystem with an xattr support. In case of ext3, you need to add 'user_xattr' to the mount options. $ sudo mount -o remount,user_xattr /store_device 2. 
Make fs $ dog cluster format --copies=3 --copies specifies the number of default data redundancy. In this case, the replicated data is stored on three machines. 3. Check cluster state Following list shows that Sheepdog is running on 32 nodes. $ dog node list Idx Node id (FNV-1a) - Host:Port ------------------------------------------------ 0 0308164db75cff7e - 10.68.13.15:7000 * 1 03104d8b4315c8e4 - 10.68.13.1:7000 2 0ab18c565bc14aea - 10.68.13.3:7000 3 0c0d27f0ac395f5d - 10.68.13.16:7000 4 127ee4802991f308 - 10.68.13.13:7000 5 135ff2beab2a9809 - 10.68.14.5:7000 6 17bd6240eab65870 - 10.68.14.4:7000 7 1cf35757cbf47d7b - 10.68.13.10:7000 8 1df9580b8960a992 - 10.68.13.11:7000 9 29307d3fa5a04f78 - 10.68.14.12:7000 10 29dcb3474e31d4f3 - 10.68.14.15:7000 11 29e089c98dd2a144 - 10.68.14.16:7000 12 2a118b7e2738f479 - 10.68.13.4:7000 13 3d6aea26ba79d75f - 10.68.13.6:7000 14 42f9444ead801767 - 10.68.14.11:7000 15 562c6f38283d09fe - 10.68.14.2:7000 16 5dd5e540cca1556a - 10.68.14.6:7000 17 6c12a5d10f10e291 - 10.68.14.13:7000 18 6dae1d955ca72d96 - 10.68.13.7:7000 19 711db0f5fa40b412 - 10.68.14.14:7000 20 7c6b95212ee7c085 - 10.68.14.9:7000 21 7d010c31bf11df73 - 10.68.13.2:7000 22 82c43e908b1f3f01 - 10.68.13.12:7000 23 931d2de0aaf61cf5 - 10.68.13.8:7000 24 961d9d391e6021e7 - 10.68.13.14:7000 25 9a3ef6fa1081026c - 10.68.13.9:7000 26 b0b3d300fed8bc26 - 10.68.14.10:7000 27 b0f08fb98c8f5edc - 10.68.14.8:7000 28 b9cc316dc5aba880 - 10.68.13.5:7000 29 d9eda1ec29c2eeeb - 10.68.14.7:7000 30 e53cebb2617c86fd - 10.68.14.1:7000 31 ea46913c4999ccdf - 10.68.14.3:7000 * Create a virtual machine image 1. Create a 256 GB virtual machine image of Alice. $ qemu-img create sheepdog:Alice 256G 2. You can also convert from existing KVM images to Sheepdog ones. $ qemu-img convert ~/amd64.raw sheepdog:Bob 3. See Sheepdog images by the following command. 
$ dog vdi list name id size used shared creation time object id -------------------------------------------------------------------- Bob 0 2.0 GB 1.6 GB 0.0 MB 2010-03-23 16:16 80000 Alice 0 256 GB 0.0 MB 0.0 MB 2010-03-23 16:16 40000 * Boot the virtual machine 1. Boot the virtual machine. $ qemu-system-x86_64 -hda sheepdog:Alice 2. Following command checks used images. $ dog vm list Name |Vdi size |Allocated| Shared | Status ----------------+---------+---------+---------+------------ Bob | 2.0 GB| 1.6 GB| 0.0 MB| running on xx.xx.xx.xx Alice | 256 GB| 0.0 MB| 0.0 MB| not running * Snapshot 1. Snapshot $ qemu-img snapshot -c name sheepdog:Alice -c flag is meaningless currently 2. After getting snapshot, a new virtual machine images are added as a not- current image. $ dog vdi list name id size used shared creation time object id -------------------------------------------------------------------- Bob 0 2.0 GB 1.6 GB 0.0 MB 2010-03-23 16:16 80000 Alice 0 256 GB 0.0 MB 0.0 MB 2010-03-23 16:21 c0000 s Alice 1 256 GB 0.0 MB 0.0 MB 2010-03-23 16:16 40000 3. You can boot from the snapshot image by spcifing tag id $ qemu-system-x86_64 -hda sheepdog:Alice:1 * Cloning from the snapshot 1. Create a Charlie image as a clone of Alice's image. $ qemu-img create -b sheepdog:Alice:1 sheepdog:Charlie 2. Charlie's image is added to the virtual machine list. $ dog vdi list name id size used shared creation time object id -------------------------------------------------------------------- Bob 0 2.0 GB 1.6 GB 0.0 MB 2010-03-23 16:16 80000 Alice 0 256 GB 0.0 MB 0.0 MB 2010-03-23 16:21 c0000 s Alice 1 256 GB 0.0 MB 0.0 MB 2010-03-23 16:16 40000 Charlie 0 256 GB 0.0 MB 0.0 MB 2010-03-23 16:23 100000 Test Environment ---------------- - Debian squeeze amd64 - Debian lenny amd64 =============================================================================== Copyright (C) 2009-2011, Nippon Telegraph and Telephone Corporation. 
sheepdog-0.7.5/autogen.sh000077500000000000000000000002331223630776600153340ustar00rootroot00000000000000#!/bin/sh # Run this to generate all the initial makefiles, etc. echo Building configuration system... autoreconf -i && echo Now run ./configure and make sheepdog-0.7.5/configure.ac000066400000000000000000000310151223630776600156230ustar00rootroot00000000000000# # Copyright 2010 Red Hat, Inc. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; see the file COPYING. If not, write to # the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
# # bootstrap / init AC_PREREQ([2.61]) m4_define([sheepdog_version], [0.7.5]) m4_define([git_version], m4_esyscmd([git describe --tags --dirty 2> /dev/null | sed 's/^v//' \ | tr '-' '_' | tr -d '\n'])) AC_INIT([sheepdog], m4_default(git_version, sheepdog_version), [sheepdog@lists.wpkg.org]) AM_INIT_AUTOMAKE([-Wno-portability]) AC_CONFIG_SRCDIR([dog/dog.c]) AC_CONFIG_HEADER([include/config.h]) AC_CANONICAL_HOST AC_LANG([C]) AM_SILENT_RULES([yes]) m4_ifndef([PKG_PROG_PKG_CONFIG], AC_MSG_ERROR([pkg-config not found])) dnl Fix default variables - "prefix" variable if not specified if test "$prefix" = "NONE"; then prefix="/usr" dnl Fix "localstatedir" variable if not specified if test "$localstatedir" = "\${prefix}/var"; then localstatedir="/var" fi dnl Fix "sysconfdir" variable if not specified if test "$sysconfdir" = "\${prefix}/etc"; then sysconfdir="/etc" fi dnl Fix "libdir" variable if not specified if test "$libdir" = "\${exec_prefix}/lib"; then if test -e /usr/lib64; then libdir="/usr/lib64" else libdir="/usr/lib" fi fi fi # check stolen from gnulib/m4/gnu-make.m4 if ! ${MAKE-make} --version /cannot/make/this >/dev/null 2>&1; then AC_MSG_ERROR([you don't seem to have GNU make; it is required]) fi AC_PROG_CC AM_PROG_AS AC_PROG_INSTALL AC_PROG_LN_S AC_PROG_MAKE_SET AC_PROG_RANLIB AC_CHECK_PROGS([GROFF], [groff]) AM_MISSING_PROG(AUTOM4TE, autom4te, $missing_dir) # Checks for libraries. AC_CHECK_LIB([socket], [socket]) # Checks for header files. AC_FUNC_ALLOCA AC_HEADER_DIRENT AC_HEADER_STDC AC_HEADER_SYS_WAIT AC_CHECK_HEADERS([arpa/inet.h fcntl.h limits.h netdb.h netinet/in.h stdint.h \ stdlib.h string.h sys/ioctl.h sys/param.h sys/socket.h \ sys/time.h syslog.h unistd.h sys/types.h getopt.h malloc.h \ sys/sockio.h utmpx.h]) AC_CHECK_HEADERS([urcu.h urcu/uatomic.h],, AC_MSG_ERROR(liburcu 0.6.0 or later is required)) # Checks for typedefs, structures, and compiler characteristics. 
AC_C_CONST AC_TYPE_UID_T AC_C_INLINE AC_TYPE_INT16_T AC_TYPE_INT32_T AC_TYPE_INT64_T AC_TYPE_INT8_T AC_TYPE_SIZE_T AC_TYPE_SSIZE_T AC_HEADER_TIME AC_TYPE_UINT16_T AC_TYPE_UINT32_T AC_TYPE_UINT64_T AC_TYPE_UINT8_T AC_C_VOLATILE # Checks for library functions. AC_FUNC_CLOSEDIR_VOID AC_FUNC_ERROR_AT_LINE AC_REPLACE_FNMATCH AC_FUNC_FORK AC_PROG_GCC_TRADITIONAL AC_FUNC_MALLOC AC_FUNC_MEMCMP AC_FUNC_REALLOC AC_FUNC_SELECT_ARGTYPES AC_TYPE_SIGNAL AC_FUNC_VPRINTF AC_CHECK_FUNCS([alarm alphasort atexit bzero dup2 endgrent endpwent fcntl \ getcwd getpeerucred getpeereid gettimeofday inet_ntoa memmove \ memset mkdir scandir select socket strcasecmp strchr strdup \ strerror strrchr strspn strstr]) AC_CONFIG_FILES([Makefile dog/Makefile sheep/Makefile sheepfs/Makefile include/Makefile script/Makefile lib/Makefile man/Makefile shepherd/Makefile tests/unit/Makefile tests/unit/mock/Makefile tests/unit/dog/Makefile tests/unit/sheep/Makefile tools/Makefile]) ### Local business # =============================================== # Helpers # =============================================== ## helper for CC stuff cc_supports_flag() { local CFLAGS="$@" AC_MSG_CHECKING([whether $CC supports "$@"]) AC_COMPILE_IFELSE([AC_LANG_SOURCE([int main(){return 0;}])] , [RC=0; AC_MSG_RESULT([yes])], [RC=1; AC_MSG_RESULT([no])]) return $RC } ## cleanup AC_MSG_NOTICE(Sanitizing prefix: ${prefix}) case $prefix in NONE) prefix=/usr/local;; esac AC_MSG_NOTICE(Sanitizing exec_prefix: ${exec_prefix}) case $exec_prefix in dnl For consistency with Sheepdog, map NONE->$prefix NONE) exec_prefix=$prefix;; prefix) exec_prefix=$prefix;; esac ## local defines PACKAGE_FEATURES="" LINT_FLAGS="-weak -unrecog +posixlib +ignoresigns -fcnuse \ -badflag -D__gnuc_va_list=va_list -D__attribute\(x\)=" AC_ARG_ENABLE([fatal-warnings], [ --enable-fatal-warnings : enable fatal warnings. ], [ default="no" ]) AC_ARG_ENABLE([debug], [ --enable-debug : enable debug build. 
], [ default="no" ]) AC_ARG_ENABLE([unittest], [ --enable-unittest : enable unittest. ], [ default="no" ]) AC_ARG_ENABLE([coverage], [ --enable-coverage : coverage analysis of the codebase. ], [ default="no" ]) AM_CONDITIONAL(BUILD_COVERAGE, test x$enable_coverage = xyes) AC_ARG_ENABLE([corosync], [ --enable-corosync : build corosync cluster driver ],, [ enable_corosync="yes" ],) AM_CONDITIONAL(BUILD_COROSYNC, test x$enable_corosync = xyes) AC_ARG_ENABLE([zookeeper], [ --enable-zookeeper : build zookeeper cluster driver ],, [ enable_zookeeper="no" ],) AM_CONDITIONAL(BUILD_ZOOKEEPER, test x$enable_zookeeper = xyes) AC_ARG_WITH([initddir], [ --with-initddir=DIR : path to init script directory. ], [ INITDDIR="$withval" ], [ INITDDIR="$sysconfdir/init.d" ]) AC_ARG_ENABLE([trace], [ --enable-trace : enable trace],, [ enable_trace="${enable_debug}" ],) AM_CONDITIONAL(BUILD_TRACE, test x$enable_trace = xyes) PKG_CHECK_MODULES([fuse],[fuse], HAVE_FUSE="yes", HAVE_FUSE="no") AC_ARG_ENABLE([sheepfs], [ --enable-sheepfs : enable sheepfs],, [ enable_sheepfs=$HAVE_FUSE ],) AM_CONDITIONAL(BUILD_SHEEPFS, test x$enable_sheepfs = xyes) AC_ARG_ENABLE([http], [ --enable-http : enable http request service (default no) ],, [ enable_http="no" ],) AM_CONDITIONAL(BUILD_HTTP, test x$enable_http = xyes) CP=cp OS_LDL="-ldl" case "$host_os" in *linux*) AC_DEFINE_UNQUOTED([SHEEPDOG_LINUX], [1], [Compiling for Linux platform]) OS_CFLAGS="" OS_CPPFLAGS="" OS_LDFLAGS="" OS_DYFLAGS="" DARWIN_OPTS="" ;; *) AC_MSG_ERROR([Unsupported OS? 
hmmmm]) ;; esac AC_SUBST(CP) # *FLAGS handling goes here ENV_CFLAGS="$CFLAGS" ENV_CPPFLAGS="$CPPFLAGS" ENV_LDFLAGS="$LDFLAGS" # debug build stuff if test "x${enable_debug}" = xyes; then AC_DEFINE_UNQUOTED([DEBUG], [1], [Compiling Debugging code]) OPT_CFLAGS="-O0" PACKAGE_FEATURES="$PACKAGE_FEATURES debug" else OPT_CFLAGS="-DNDEBUG" fi # gdb flags if test "x${GCC}" = xyes; then GDB_FLAGS="-ggdb3" else GDB_FLAGS="-g" fi if test "x${enable_corosync}" = xyes; then PKG_CHECK_MODULES([corosync],[corosync]) PKG_CHECK_MODULES([libcpg],[libcpg]) PKG_CHECK_MODULES([libcfg],[libcfg]) AC_DEFINE_UNQUOTED([HAVE_COROSYNC], 1, [have corosync]) PACKAGE_FEATURES="$PACKAGE_FEATURES corosync" fi if test "x${enable_zookeeper}" = xyes; then AC_CHECK_LIB([zookeeper_mt], [zookeeper_init],, AC_MSG_ERROR(libzookeeper not found)) AC_CHECK_HEADERS([zookeeper/zookeeper.h],, AC_MSG_ERROR(zookeeper.h header missing)) AC_DEFINE_UNQUOTED([HAVE_ZOOKEEPER], 1, [have zookeeper]) PACKAGE_FEATURES="$PACKAGE_FEATURES zookeeper" fi if test "x${enable_trace}" = xyes; then if test "x${enable_coverage}" = xyes; then AC_MSG_ERROR(tracer cannot be used with coverage options) fi if [[[ $host != *x86_64* ]]]; then AC_MSG_ERROR(tracer can be used on x86_64 architectures) fi AC_CHECK_LIB([bfd], [bfd_openr],, AC_MSG_ERROR(requires binutils-dev)) AC_CHECK_HEADERS([bfd.h],, AC_MSG_ERROR(requires binutils-dev)) AC_CHECK_LIB([rt], [clock_gettime],, AC_MSG_ERROR(librt not found)) AC_DEFINE_UNQUOTED([HAVE_TRACE], 1, [have trace]) PACKAGE_FEATURES="$PACKAGE_FEATURES trace" fi if test "x${enable_sheepfs}" = xyes; then AC_CHECK_HEADERS([fuse.h],, AC_MSG_ERROR(fuse.h header missing), [#define _FILE_OFFSET_BITS 64]) AC_DEFINE_UNQUOTED([HAVE_SHEEPFS], 1, [have sheepfs]) PACKAGE_FEATURES="$PACKAGE_FEATURES sheepfs" fi if test "x${enable_http}" = xyes; then AC_CHECK_HEADERS([fcgiapp.h],, AC_MSG_ERROR(fcgiapp.h header not found)) AC_CHECK_LIB([fcgi], [FCGX_Accept],, AC_MSG_ERROR(libfcgi not found)) AC_DEFINE_UNQUOTED(HAVE_HTTP, 
1, [have http]) PACKAGE_FEATURES="$PACKAGE_FEATURES http" fi # extra warnings EXTRA_WARNINGS="" WARNLIST=" all shadow missing-prototypes missing-declarations strict-prototypes pointer-arith write-strings bad-function-cast missing-format-attribute format=2 format-security format-nonliteral no-long-long unsigned-char gnu89-inline no-strict-aliasing " case "${host}" in arm*) ;; *) WARNLIST="${WARNLIST} cast-align" ;; esac for j in $WARNLIST; do if cc_supports_flag -W$j; then EXTRA_WARNINGS="$EXTRA_WARNINGS -W$j"; fi done if test "x${enable_coverage}" = xyes && \ cc_supports_flag -ftest-coverage && \ cc_supports_flag -fprofile-arcs ; then AC_MSG_NOTICE([Enabling Coverage (enable -O0 by default)]) OPT_CFLAGS="-O0" COVERAGE_CFLAGS="-ftest-coverage -fprofile-arcs" COVERAGE_LDFLAGS="-ftest-coverage -fprofile-arcs" PACKAGE_FEATURES="$PACKAGE_FEATURES coverage" enable_unittest="yes" else COVERAGE_CFLAGS="" COVERAGE_LDFLAGS="" fi if test "x${enable_unittest}" = xyes; then PKG_CHECK_MODULES([CHECK], [check >= 0.9.4]) fi AM_CONDITIONAL(BUILD_UNITTEST, test x$enable_unittest = xyes) if test "x${enable_fatal_warnings}" = xyes && \ cc_supports_flag -Werror ; then AC_MSG_NOTICE([Enabling Fatal Warnings (-Werror)]) WERROR_CFLAGS="-Werror" PACKAGE_FEATURES="$PACKAGE_FEATURES fatal-warnings" else WERROR_CFLAGS="" fi if test "x${enable_trace}" = xyes && \ cc_supports_flag -pg ; then AC_MSG_NOTICE([Enabling trace (-pg)]) TRACE_CFLAGS="-pg" else TRACE_CFLAGS="" fi # final build of *FLAGS CFLAGS="$ENV_CFLAGS $OPT_CFLAGS $GDB_FLAGS $OS_CFLAGS \ $TRACE_CFLAGS $COVERAGE_CFLAGS $EXTRA_WARNINGS $WERROR_CFLAGS \ -D_GNU_SOURCE -D_LGPL_SOURCE -std=gnu99" CPPFLAGS="$ENV_CPPFLAGS $ANSI_CPPFLAGS $OS_CPPFLAGS" LDFLAGS="$ENV_LDFLAGS $COVERAGE_LDFLAGS $OS_LDFLAGS" # substitute what we need: AC_SUBST([OS_DYFLAGS]) AM_CONDITIONAL(BUILD_HTML_DOCS, test -n "${GROFF}") AC_SUBST([INITDDIR]) AC_SUBST([LINT_FLAGS]) AC_DEFINE_UNQUOTED([LOCALSTATEDIR], "$(eval echo ${localstatedir})", [localstate directory]) 
COROSYSCONFDIR=${sysconfdir}/sheepdog AC_SUBST([COROSYSCONFDIR]) AC_DEFINE_UNQUOTED([COROSYSCONFDIR], "$(eval echo ${COROSYSCONFDIR})", [sheepdog config directory]) AC_DEFINE_UNQUOTED([PACKAGE_FEATURES], "${PACKAGE_FEATURES}", [sheepdog built-in features]) AC_OUTPUT AC_MSG_RESULT([]) AC_MSG_RESULT([$PACKAGE configuration:]) AC_MSG_RESULT([ Version = ${VERSION}]) AC_MSG_RESULT([ Prefix = ${prefix}]) AC_MSG_RESULT([ Executables = ${sbindir}]) AC_MSG_RESULT([ Man pages = ${mandir}]) AC_MSG_RESULT([ Doc dir = ${docdir}]) AC_MSG_RESULT([ Libraries = ${libdir}]) AC_MSG_RESULT([ Header files = ${includedir}]) AC_MSG_RESULT([ Arch-independent files = ${datadir}]) AC_MSG_RESULT([ State information = ${localstatedir}]) AC_MSG_RESULT([ System configuration = ${sysconfdir}]) AC_MSG_RESULT([ System init.d directory = ${INITDDIR}]) AC_MSG_RESULT([ sheepdog config dir = ${COROSYSCONFDIR}]) AC_MSG_RESULT([ Features =${PACKAGE_FEATURES}]) AC_MSG_RESULT([]) AC_MSG_RESULT([$PACKAGE build info:]) AC_MSG_RESULT([ Library SONAME = ${SONAME}]) LIB_MSG_RESULT(m4_shift(local_soname_list))dnl AC_MSG_RESULT([ Default optimization = ${OPT_CFLAGS}]) AC_MSG_RESULT([ Default debug options = ${GDB_CFLAGS}]) AC_MSG_RESULT([ Extra compiler warnings = ${EXTRA_WARNING}]) AC_MSG_RESULT([ Env. defined CFLAG = ${ENV_CFLAGS}]) AC_MSG_RESULT([ Env. defined CPPFLAGS = ${ENV_CPPFLAGS}]) AC_MSG_RESULT([ Env. defined LDFLAGS = ${ENV_LDFLAGS}]) AC_MSG_RESULT([ OS defined CFLAGS = ${OS_CFLAGS}]) AC_MSG_RESULT([ OS defined CPPFLAGS = ${OS_CPPFLAGS}]) AC_MSG_RESULT([ OS defined LDFLAGS = ${OS_LDFLAGS}]) AC_MSG_RESULT([ OS defined LDL = ${OS_LDL}]) AC_MSG_RESULT([ OS defined DYFLAGS = ${OS_DYFLAGS}]) AC_MSG_RESULT([ ANSI defined CPPFLAGS = ${ANSI_CPPFLAGS}]) AC_MSG_RESULT([ Coverage CFLAGS = ${COVERAGE_CFLAGS}]) AC_MSG_RESULT([ Coverage LDFLAGS = ${COVERAGE_LDFLAGS}]) AC_MSG_RESULT([ Fatal War. 
CFLAGS = ${WERROR_CFLAGS}]) AC_MSG_RESULT([ Trace CFLAGS = ${TRACE_CFLAGS}]) AC_MSG_RESULT([ Final CFLAGS = ${CFLAGS}]) AC_MSG_RESULT([ Final CPPFLAGS = ${CPPFLAGS}]) AC_MSG_RESULT([ Final LDFLAGS = ${LDFLAGS}]) sheepdog-0.7.5/debian/000077500000000000000000000000001223630776600145575ustar00rootroot00000000000000sheepdog-0.7.5/debian/.gitignore000066400000000000000000000001561223630776600165510ustar00rootroot00000000000000*.substvars *.debhelper *.debhelper.log autoreconf.after autoreconf.before files sheepdog CHANGELOG changelog sheepdog-0.7.5/debian/compat000066400000000000000000000000021223630776600157550ustar00rootroot000000000000009 sheepdog-0.7.5/debian/control000066400000000000000000000017041223630776600161640ustar00rootroot00000000000000Source: sheepdog Section: admin Priority: optional Maintainer: PKG OpenStack Uploaders: YunQiang Su Build-Depends: debhelper (>= 9), dh-autoreconf, bash-completion, pkg-config, libcorosync-dev, liburcu-dev, libzookeeper-mt-dev [linux-any], libfuse-dev, po-debconf Standards-Version: 3.9.4 Homepage: http://sheepdog.github.io/sheepdog Vcs-Browser: http://anonscm.debian.org/?p=openstack/sheepdog.git Vcs-Git: git://anonscm.debian.org/openstack/sheepdog.git Package: sheepdog Architecture: any Pre-Depends: dpkg (>= 1.15.6~) Depends: ${shlibs:Depends}, ${misc:Depends} Recommends: corosync Description: distributed storage system for QEMU Sheepdog provides highly available block level storage volumes that can be attached to QEMU virtual machines. Sheepdog scales to several hundred nodes, and supports advanced volume management features such as snapshots, cloning, and thin provisioning. 
sheepdog-0.7.5/debian/copyright000066400000000000000000000030101223630776600165040ustar00rootroot00000000000000Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ Upstream-Contact: MORITA Kazutaka , Liu Yuan Upstream-Name: sheepdog Source: git://github.com/sheepdog/sheepdog.git Files: debian/* Copyright: 2010, Guido Günther 2012, YunQiang Su 2012, Thomas Goirand License: GPL-2 Files: * Copyright: 2009-2011 Nippon Telegraph and Telephone Corporation With upstream authors as folow: 2009-2011, MORITA Kazutaka 2009-2011, FUJITA Tomonori 2009-2011, MORIAI Satoshi License: GPL-2 License: GPL-2 This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. . This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. . You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA . On Debian systems, the complete text of the GNU General Public License v2 (GPL) can be found in /usr/share/common-licenses/GPL-2. 
sheepdog-0.7.5/debian/debian-sheepdog-default000066400000000000000000000015221223630776600211420ustar00rootroot00000000000000# start sheepdog at boot [yes|no] START="yes" # Arguments to run the daemon with # Options: # -p, --port specify the TCP port on which to listen # -l, --loglevel specify the level of logging detail # -d, --debug include debug messages in the log # -D, --directio use direct IO when accessing the object store # -z, --zone specify the zone id # -c, --cluster specify the cluster driver DAEMON_ARGS="" # SHEEPDOG_PATH # Proper LSB systems will store sheepdog files in /var/lib/sheepdog. The init script uses this directory by default. # The directory must be on a filesystem with xattr support. In the case of ext3, user_xattr should be added to the # mount options. # # mount -o remount,user_xattr /var/lib/shepdog SHEEPDOG_PATH="/var/lib/sheepdog" sheepdog-0.7.5/debian/docs000066400000000000000000000000071223630776600154270ustar00rootroot00000000000000README sheepdog-0.7.5/debian/gbp.conf000066400000000000000000000002401223630776600161720ustar00rootroot00000000000000[DEFAULT] upstream-branch = master debian-branch = debian/unstable upstream-tag = v%(version)s compression = xz [git-buildpackage] export-dir = ../build-area/ sheepdog-0.7.5/debian/po/000077500000000000000000000000001223630776600151755ustar00rootroot00000000000000sheepdog-0.7.5/debian/po/POTFILES.in000066400000000000000000000000551223630776600167520ustar00rootroot00000000000000[type: gettext/rfc822deb] sheepdog.templates sheepdog-0.7.5/debian/rules000077500000000000000000000040361223630776600156420ustar00rootroot00000000000000#!/usr/bin/make -f # -*- makefile -*- DEBVERS ?= $(shell dpkg-parsechangelog | sed -n -e 's/^Version: //p') VERSION ?= $(shell echo '$(DEBVERS)' | sed -e 's/^[[:digit:]]*://' -e 's/[-].*//') DEBFLAVOR ?= $(shell dpkg-parsechangelog | grep -E ^Distribution: | cut -d" " -f2) DEBPKGNAME ?= $(shell dpkg-parsechangelog | grep -E ^Source: | cut -d" " -f2) UPSTREAM_GIT ?= 
git://github.com/sheepdog/sheepdog.git GIT_TAG ?= $(shell echo v'$(VERSION)' | sed -e 's/~/_/') %: dh $@ --with autoreconf override_dh_builddeb: dh_builddeb -- -Zxz -z9 override_dh_autoreconf: dh_autoreconf --mode=timesize override_dh_auto_build: dh_auto_build --parallel UNAME := $(shell uname) ifeq ($(UNAME),Linux) ZOOKEEPER=--enable-zookeeper endif override_dh_auto_configure: dh_auto_configure -- ${ZOOKEEPER} override_dh_install: dh_install rm -rf debian/sheepdog/etc/init.d/ dh_bash-completion get-vcs-source: git remote add upstream $(UPSTREAM_GIT) || true git fetch upstream if [ ! -f ../$(DEBPKGNAME)_$(VERSION).orig.tar.xz ] ; then \ git archive --prefix=$(DEBPKGNAME)-$(GIT_TAG)/ $(GIT_TAG) | xz >../$(DEBPKGNAME)_$(VERSION).orig.tar.xz ; \ fi if [ ! -e ../build-area ] ; then mkdir ../build-area ; fi if [ ! -e ../build-area ] ; then cp ../$(DEBPKGNAME)_$(VERSION).orig.tar.xz ../build-area ; fi if ! git checkout master ; then \ echo "No upstream branch: checking out" ; \ git checkout -b master upstream/master ; \ fi git checkout debian/$(DEBFLAVOR) display-po-stats: cd $(CURDIR)/debian/po; for i in *.po ;do \ echo -n $$i": ";\ msgfmt -o /dev/null --statistic $$i ; \ done call-for-po-trans: podebconf-report-po --call --withtranslators --languageteam gen-upstream-changelog: git checkout master git reset --hard $(GIT_TAG) git log >$(CURDIR)/../CHANGELOG git checkout debian/$(DEBFLAVOR) mv $(CURDIR)/../CHANGELOG $(CURDIR)/debian/CHANGELOG git add $(CURDIR)/debian/CHANGELOG git commit -a -m "Updated upstream changelog" override_dh_installchangelogs: dh_installchangelogs $(CURDIR)/debian/CHANGELOG sheepdog-0.7.5/debian/sheepdog.bash-completion000066400000000000000000000001011223630776600213530ustar00rootroot00000000000000script/bash_completion_dog dog script/bash_completion_dog collie sheepdog-0.7.5/debian/sheepdog.config000066400000000000000000000005631223630776600175500ustar00rootroot00000000000000#!/bin/sh set -e . 
/usr/share/debconf/confmodule if [ -r /etc/default/sheepdog ] ; then . /etc/default/sheepdog if [ x"yes" = x"$START" ] ; then db_set sheepdog/start true else db_set sheepdog/start false fi db_set sheepdog/daemon_args "$DAEMON_ARGS" fi db_input medium sheepdog/start || true db_input medium sheepdog/daemon_args || true db_go || true exit 0 sheepdog-0.7.5/debian/sheepdog.init000066400000000000000000000101041223630776600172360ustar00rootroot00000000000000#!/bin/sh ### BEGIN INIT INFO # Provides: sheepdog # Required-Start: hostname $network $remote_fs $syslog # Required-Stop: $remote_fs # Default-Start: 2 3 4 5 # Default-Stop: 0 1 6 # Short-Description: Sheepdog is a distributed storage system for KVM/QEMU. # Description: Sheepdog is a distributed storage system for KVM/QEMU. It provides # highly available block level storage volumes to virtual machines. # Sheepdog supports advanced volume management features such as snapshot, # cloning, and thin provisioning. The architecture of Sheepdog is fully # symmetric; there is no central node such as a meta-data server. ### END INIT INFO # Author: YunQiang Su # PATH should only include /usr/* if it runs after the mountnfs.sh script PATH=/sbin:/usr/sbin:/bin:/usr/bin DESC=sheepdog # Introduce a short description here NAME=sheepdog # Introduce the short server's name here DAEMON=/usr/sbin/sheep # Introduce the server's location here DAEMON_ARGS="" # Arguments to run the daemon with PIDFILE=/var/run/$NAME.pid SCRIPTNAME=/etc/init.d/$NAME # Exit if the package is not installed [ -x $DAEMON ] || exit 0 # Read configuration variable file if it is present [ -r /etc/default/$NAME ] && . /etc/default/$NAME #FIXME: user cannot give pidfile in /etc/default/sheepdog DAEMON_ARGS="$DAEMON_ARGS --pidfile $PIDFILE" if [ "$START" != "yes" ]; then exit 0 fi # Define LSB log_* functions. # Depend on lsb-base (>= 3.0-6) to ensure that this file is present. . 
/lib/lsb/init-functions # # Function that starts the daemon/service # do_start() { # Return # 0 if daemon has been started # 1 if daemon was already running # 2 if daemon could not be started start-stop-daemon --start --quiet --pidfile $PIDFILE --exec $DAEMON --test > /dev/null \ || return 1 start-stop-daemon --start --quiet --pidfile $PIDFILE --exec $DAEMON -- \ $DAEMON_ARGS $SHEEPDOG_PATH \ || return 2 # Add code here, if necessary, that waits for the process to be ready # to handle requests from services started subsequently which depend # on this one. As a last resort, sleep for some time. } # # Function that stops the daemon/service # do_stop() { # Return # 0 if daemon has been stopped # 1 if daemon was already stopped # 2 if daemon could not be stopped # other if a failure occurred start-stop-daemon --stop --quiet --pidfile $PIDFILE RETVAL="$?" return "$RETVAL" } # # Function that sends a SIGHUP to the daemon/service # do_reload() { # # If the daemon can reload its configuration without # restarting (for example, when it is sent a SIGHUP), # then implement that here. # start-stop-daemon --stop --signal 1 --quiet --pidfile $PIDFILE --name $NAME return 0 } case "$1" in start) [ "$VERBOSE" != no ] && log_daemon_msg "Starting $DESC " "$NAME" do_start case "$?" in 0|1) [ "$VERBOSE" != no ] && log_end_msg 0 ;; 2) [ "$VERBOSE" != no ] && log_end_msg 1 ;; esac ;; stop) [ "$VERBOSE" != no ] && log_daemon_msg "Stopping $DESC" "$NAME" do_stop case "$?" in 0|1) [ "$VERBOSE" != no ] && log_end_msg 0 ;; 2) [ "$VERBOSE" != no ] && log_end_msg 1 ;; esac ;; status) status_of_proc "$DAEMON" "$NAME" && exit 0 || exit $? ;; #reload|force-reload) # # If do_reload() is not implemented then leave this commented out # and leave 'force-reload' as an alias for 'restart'. # #log_daemon_msg "Reloading $DESC" "$NAME" #do_reload #log_end_msg $? 
#;; restart|force-reload) # # If the "reload" option is implemented then remove the # 'force-reload' alias # log_daemon_msg "Restarting $DESC" "$NAME" do_stop case "$?" in 0|1) do_start case "$?" in 0) log_end_msg 0 ;; 1) log_end_msg 1 ;; # Old process is still running *) log_end_msg 1 ;; # Failed to start esac ;; *) # Failed to stop log_end_msg 1 ;; esac ;; *) #echo "Usage: $SCRIPTNAME {start|stop|restart|reload|force-reload}" >&2 echo "Usage: $SCRIPTNAME {start|stop|status|restart|force-reload}" >&2 exit 3 ;; esac exit 0 sheepdog-0.7.5/debian/sheepdog.install000066400000000000000000000000631223630776600177440ustar00rootroot00000000000000debian/debian-sheepdog-default /usr/share/sheepdog sheepdog-0.7.5/debian/sheepdog.links000066400000000000000000000000371223630776600174170ustar00rootroot00000000000000/usr/sbin/dog /usr/sbin/collie sheepdog-0.7.5/debian/sheepdog.postinst000066400000000000000000000012111223630776600201550ustar00rootroot00000000000000#!/bin/sh set -e if [ "$1" = "configure" ] ; then . /usr/share/debconf/confmodule mkdir -p /var/lib/sheepdog/ mkdir -p /etc/default if [ ! 
-e /etc/default/sheepdog ] ; then cp /usr/share/sheepdog/debian-sheepdog-default /etc/default/sheepdog fi if [ -r /etc/default/sheepdog ] ; then db_get sheepdog/start if [ "${RET}" = "true" ] ; then SERVICE_START="yes" else SERVICE_START="no" fi sed -i -e "s/^[ \t]*START=.*/START=\"$SERVICE_START\"/g" /etc/default/sheepdog db_get sheepdog/daemon_args sed -i -e "s/^[ \t]*DAEMON_ARGS=.*/DAEMON_ARGS=\"$RET\"/g" /etc/default/sheepdog fi db_stop || true fi #DEBHELPER# exit 0 sheepdog-0.7.5/debian/sheepdog.postrm000066400000000000000000000002011223630776600176140ustar00rootroot00000000000000#!/bin/sh set -e if [ "${1}" = "purge" ] ; then rm -f /etc/default/sheepdog rm -rf /var/lib/sheepdog fi #DEBHELPER# exit 0 sheepdog-0.7.5/debian/sheepdog.templates000066400000000000000000000023651223630776600203030ustar00rootroot00000000000000# These templates have been reviewed by the debian-l10n-english # team # # If modifications/additions/rewording are needed, please ask # debian-l10n-english@lists.debian.org for advice. # # Even minor modifications require translation updates and such # changes should be coordinated with translators and reviewers. Template: sheepdog/start Type: boolean Default: false _Description: Automatically start the sheepdog service? Please choose whether the sheepdog service should start automatically when the system is booted. Template: sheepdog/daemon_args Type: string Default: _Description: Arguments for the sheepdog daemon: Please choose the command line arguments that should be passed to the sheepdog daemon. If no argument is given, the default behavior is to start on port 7000, using the corosync driver. . 
Available options include: -p, --port specify the TCP port to listen to -l, --loglevel specify the level of logging detail -d, --debug include debug messages in the log -D, --directio use direct I/O when accessing the object store -z, --zone specify the zone ID -c, --cluster specify the cluster driver More information can be found in the sheep(8) manual page. sheepdog-0.7.5/debian/source/000077500000000000000000000000001223630776600160575ustar00rootroot00000000000000sheepdog-0.7.5/debian/source/format000066400000000000000000000000141223630776600172650ustar00rootroot000000000000003.0 (quilt) sheepdog-0.7.5/debian/watch000066400000000000000000000001041223630776600156030ustar00rootroot00000000000000version=3 https://github.com/sheepdog/sheepdog/tags .*/v(.*).tar.gz sheepdog-0.7.5/doc/000077500000000000000000000000001223630776600141025ustar00rootroot00000000000000sheepdog-0.7.5/doc/api-strbuf.txt000066400000000000000000000147561223630776600167340ustar00rootroot00000000000000strbuf API ========== strbuf's are meant to be used with all the usual C string and memory APIs. Given that the length of the buffer is known, it's often better to use the mem* functions than a str* one (memchr vs. strchr e.g.). Though, one has to be careful about the fact that str* functions often stop on NULs and that strbufs may have embedded NULs. An strbuf is NUL terminated for convenience, but no function in the strbuf API actually relies on the string being free of NULs. strbufs has some invariants that are very important to keep in mind: . The `buf` member is never NULL, so it can be used in any usual C string operations safely. strbuf's _have_ to be initialized either by `strbuf_init()` or by `= STRBUF_INIT` before the invariants, though. + Do *not* assume anything on what `buf` really is (e.g. if it is allocated memory or not), use `strbuf_detach()` to unwrap a memory buffer from its strbuf shell in a safe way. That is the sole supported way. 
This will give you a malloced buffer that you can later `free()`. + However, it is totally safe to modify anything in the string pointed by the `buf` member, between the indices `0` and `len-1` (inclusive). . The `buf` member is a byte array that has at least `len + 1` bytes allocated. The extra byte is used to store a `'\0'`, allowing the `buf` member to be a valid C-string. Every strbuf function ensure this invariant is preserved. + NOTE: It is OK to "play" with the buffer directly if you work it this way: + ---- strbuf_grow(sb, SOME_SIZE); <1> strbuf_setlen(sb, sb->len + SOME_OTHER_SIZE); ---- <1> Here, the memory array starting at `sb->buf`, and of length `strbuf_avail(sb)` is all yours, and you can be sure that `strbuf_avail(sb)` is at least `SOME_SIZE`. + NOTE: `SOME_OTHER_SIZE` must be smaller or equal to `strbuf_avail(sb)`. + Doing so is safe, though if it has to be done in many places, adding the missing API to the strbuf module is the way to go. + WARNING: Do _not_ assume that the area that is yours is of size `alloc - 1` even if it's true in the current implementation. Alloc is somehow a "private" member that should not be messed with. Use `strbuf_avail()` instead. Data structures --------------- * `struct strbuf` This is the string buffer structure. The `len` member can be used to determine the current length of the string, and `buf` member provides access to the string itself. Functions --------- * Life cycle `strbuf_init`:: Initialize the structure. The second parameter can be zero or a bigger number to allocate memory, in case you want to prevent further reallocs. `strbuf_release`:: Release a string buffer and the memory it used. You should not use the string buffer after using this function, unless you initialize it again. `strbuf_detach`:: Detach the string from the strbuf and returns it; you now own the storage the string occupies and it is your responsibility from then on to release it with `free(3)` when you are done with it. 
`strbuf_attach`:: Attach a string to a buffer. You should specify the string to attach, the current length of the string and the amount of allocated memory. The amount must be larger than the string length, because the string you pass is supposed to be a NUL-terminated string. This string _must_ be malloc()ed, and after attaching, the pointer cannot be relied upon anymore, and neither be free()d directly. * Related to the size of the buffer `strbuf_avail`:: Determine the amount of allocated but unused memory. `strbuf_grow`:: Ensure that at least this amount of unused memory is available after `len`. This is used when you know a typical size for what you will add and want to avoid repetitive automatic resizing of the underlying buffer. This is never a needed operation, but can be critical for performance in some cases. `strbuf_setlen`:: Set the length of the buffer to a given value. This function does *not* allocate new memory, so you should not perform a `strbuf_setlen()` to a length that is larger than `len + strbuf_avail()`. `strbuf_setlen()` is just meant as a 'please fix invariants from this strbuf I just messed with'. `strbuf_reset`:: Empty the buffer by setting the size of it to zero. * Related to the contents of the buffer `strbuf_rtrim`:: Strip whitespace from the end of a string. * Adding data to the buffer NOTE: All of the functions in this section will grow the buffer as necessary. If they fail for some reason other than memory shortage and the buffer hadn't been allocated before (i.e. the `struct strbuf` was set to `STRBUF_INIT`), then they will free() it. `strbuf_addch`:: Add a single character to the buffer. `strbuf_insert`:: Insert data to the given position of the buffer. The remaining contents will be shifted, not overwritten. `strbuf_remove`:: Remove given amount of data from a given position of the buffer. `strbuf_splice`:: Remove the bytes between `pos..pos+len` and replace it with the given data. 
`strbuf_add`:: Add data of given length to the buffer. `strbuf_addstr`:: Add a NUL-terminated string to the buffer. + NOTE: This function will *always* be implemented as an inline or a macro that expands to: + ---- strbuf_add(..., s, strlen(s)); ---- + Meaning that this is efficient to write things like: + ---- strbuf_addstr(sb, "immediate string"); ---- `strbuf_addbuf`:: Copy the contents of an other buffer at the end of the current one. `strbuf_addf`:: Add a formatted string to the buffer. `strbuf_fread`:: Read a given size of data from a FILE* pointer to the buffer. + NOTE: The buffer is rewound if the read fails. If -1 is returned, `errno` must be consulted, like you would do for `read(3)`. `strbuf_read()`, `strbuf_read_file()` and `strbuf_getline()` has the same behaviour as well. `strbuf_read`:: Read the contents of a given file descriptor. The third argument can be used to give a hint about the file size, to avoid reallocs. `strbuf_getline`:: Read a line from a FILE *, overwriting the existing contents of the strbuf. The second argument specifies the line terminator character, typically `'\n'`. Reading stops after the terminator or at EOF. The terminator is removed from the buffer before returning. Returns 0 unless there was nothing left before EOF, in which case it returns `EOF`. `strbuf_copyout`:: Copy the contents of the strbuf to the second argument 'buf'. The number of bytes to be copied is at most the third argument 'len'. `strbuf_stripout`:: Strip out the contents of the strbuf to the second argument 'buf'. The number of bytes to be copied is at most the third argument 'len'. sheepdog-0.7.5/doc/farm-internal.txt000066400000000000000000000114521223630776600174050ustar00rootroot00000000000000 ================== Farm Store ================== Liu Yuan Taobao Inc. 1. OVERVIEW Farm is an object store for Sheepdog on node basis. 
It consists of backend store, which caches the snapshot objects, and working directory, storing objects that Sheepdog currently operates. That being said, the I/O performance for VM Guests would be practically the same as Simple Store. [*] [*] Simple Store is an older storage backend which has been removed from the tree. Snapshots are triggered either by system recovery code or users, and Farm is supposed to restore all the object states into the ones at the time of the user snapshot being taken. Snapshot object in the context means both meta object and data object. 2. DESIGN Simply put, Farm somewhat resembles git a lot (both code and idea level). there are three object type, named 'data, trunk, snapshot[*]' that is similar to git's 'blob, tree, commit'. [*] shorten to 'snap' below. 'data' object is just Sheepdog's I/O object, only named by its sha1-ed content. So the data objects with the same content will be mapped to only single sha1 file, thus achieve node-wide data sharing. 'trunk' object ties data objects together into a flat directory structure at the time of the snapshot being taken. The trunk object provides a means to find old data objects in the store. 'snap' object describes the snapshot, either initiated by users or triggered by recovery code. The snap object refers to one of the trunk objects. The two snap log files provides a means to name the desired snap object. All the objects are depicted in the context of snapshotting or retrieving old data from the snapshotted objects, that is, those objects are 'cached' into Farm store by performing snapshot operations. 2. OBJECT LAYOUT All the objects(snap, trunk, data) in the Farm is based on the operations of the sha1_file. sha1_file provides us compressed and consistency-aware characteristics independent of content or the type of the object. 
The object successfully inflates to a stream of bytes that forms a sequence of + | | header payload The payload of the data object is the compressed content of Sheepdog's I/O object. For trunk object, the compressed content is struct trunk_entry { uint64_t oid; unsigned char sha1[SHA1_LEN]; }; For snap object, the compressed content is + As for snap operations, besides snap object, Farm has two log files with the below structure struct snap_log { uint32_t epoch; uint64_t time; unsigned char sha1[SHA1_LEN]; }; This provides an internal naming mechanism and help us find snap objects by epoch. 3. STALE OBJECT For storing one object into backend store when the snapshot is taken, either a) no content change, then point to the same old sha1_file (no stale object) or b) content updated, then will point to a new object with a new sha1. We need to remove stale object in case b), only in the assumption that it is the object generated by recovery code. [*] When we try store new snapshot object into the backend store, it is safe and good timing for us to remove the old object with the same object ID. For user snapshot objects, we don't need to remove them until the snapshot is deleted. [*] Here I assume we don't need to restore to 'sys epoch' state. 4. 
FLOW FIGURE sys_snap, user_snap snapshot requests | | |put/get snap_sha1 | trigger v | +----------+ +------+ +--------+ v +----------+ | |<------>| snap |<++++++>| | <========> | | | | +------+ | | | Farm | | | | trunk | | Working | I/O +-------+ | |<---------------------->| | | Directory| <~~~~~~>|sheep | | Farm | +--------+ | | +-------+ | Backend | | | | Store | | | | |<-------------------------------------------->| | | | | | +----------+ +----------+ <-----> put/get objects to/from Farm Store <+++++> put/get trunk_sha1 to/from snap object <=====> put/get oid/oid_sha1 pairs to/from trunk object sheepdog-0.7.5/dog/000077500000000000000000000000001223630776600141065ustar00rootroot00000000000000sheepdog-0.7.5/dog/Makefile.am000066400000000000000000000031211223630776600161370ustar00rootroot00000000000000# # Copyright 2010 Red Hat, Inc. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; see the file COPYING. If not, write to # the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
# MAINTAINERCLEANFILES = Makefile.in AM_CFLAGS = AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include sbin_PROGRAMS = dog dog_SOURCES = farm/object_tree.c farm/sha1_file.c farm/snap.c \ farm/trunk.c farm/farm.c farm/slice.c \ dog.c common.c treeview.c vdi.c node.c cluster.c if BUILD_TRACE dog_SOURCES += trace.c override CFLAGS := $(subst -pg,,$(CFLAGS)) endif dog_LDADD = ../lib/libsheepdog.a -lpthread dog_DEPENDENCIES = ../lib/libsheepdog.a noinst_HEADERS = treeview.h dog.h farm/farm.h EXTRA_DIST = install-exec-hook: if [ -z "${DESTDIR}" ];then $(LN_S) -f ${sbindir}/dog ${sbindir}/collie;fi uninstall-hook: rm -f ${sbindir}/collie all-local: @echo Built dog clean-local: rm -f dog *.o gmon.out *.da *.bb *.bbg # support for GNU Flymake check-syntax: $(COMPILE) -fsyntax-only $(CHK_SOURCES) check-style: @$(CHECK_STYLE) $(dog_SOURCES) $(noinst_HEADERS) coverage: @lcov -d . -c -o dog.info sheepdog-0.7.5/dog/cluster.c000066400000000000000000000304431223630776600157370ustar00rootroot00000000000000/* * Copyright (C) 2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include #include #include #include #include "dog.h" #include "farm/farm.h" static struct sd_option cluster_options[] = { {'b', "store", true, "specify backend store"}, {'c', "copies", true, "specify the default data redundancy (number of copies)"}, {'f', "force", false, "do not prompt for confirmation"}, { 0, NULL, false, NULL }, }; static struct cluster_cmd_data { int copies; bool force; char name[STORE_LEN]; } cluster_cmd_data; #define DEFAULT_STORE "plain" static int list_store(void) { int ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; char buf[512] = { 0 }; sd_init_req(&hdr, SD_OP_GET_STORE_LIST); hdr.data_length = 512; ret = dog_exec_req(sdhost, sdport, &hdr, buf); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result != SD_RES_SUCCESS) { sd_err("Restore failed: %s", sd_strerror(rsp->result)); return EXIT_FAILURE; } printf("Available stores:\n"); printf("---------------------------------------\n"); printf("%s\n", buf); return EXIT_SYSFAIL; } static bool no_vdi(const unsigned long *vdis) { return find_next_bit(vdis, SD_NR_VDIS, 0) == SD_NR_VDIS; } #define FORMAT_PRINT \ " __\n" \ " ()'`;\n" \ " /\\|`\n" \ " / | Caution! The cluster is not empty.\n" \ "(/_)_|_ Are you sure you want to continue? 
[yes/no]: " static int cluster_format(int argc, char **argv) { int ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; struct timeval tv; char store_name[STORE_LEN]; static DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS); sd_init_req(&hdr, SD_OP_READ_VDIS); hdr.data_length = sizeof(vdi_inuse); ret = dog_exec_req(sdhost, sdport, &hdr, &vdi_inuse); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result != SD_RES_SUCCESS) { sd_err("%s", sd_strerror(rsp->result)); return EXIT_FAILURE; } if (!no_vdi(vdi_inuse)) confirm(FORMAT_PRINT); gettimeofday(&tv, NULL); sd_init_req(&hdr, SD_OP_MAKE_FS); hdr.cluster.copies = cluster_cmd_data.copies; hdr.cluster.ctime = (uint64_t) tv.tv_sec << 32 | tv.tv_usec * 1000; if (strlen(cluster_cmd_data.name)) pstrcpy(store_name, STORE_LEN, cluster_cmd_data.name); else pstrcpy(store_name, STORE_LEN, DEFAULT_STORE); hdr.data_length = strlen(store_name) + 1; hdr.flags |= SD_FLAG_CMD_WRITE; printf("using backend %s store\n", store_name); ret = dog_exec_req(sdhost, sdport, &hdr, store_name); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result != SD_RES_SUCCESS) { sd_err("Format failed: %s", sd_strerror(rsp->result)); if (rsp->result == SD_RES_NO_STORE) return list_store(); else return EXIT_SYSFAIL; } return EXIT_SUCCESS; } static int cluster_info(int argc, char **argv) { int i, ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; struct epoch_log *logs; int nr_logs, log_length; time_t ti, ct; struct tm tm; char time_str[128]; log_length = sd_epoch * sizeof(struct epoch_log); logs = xmalloc(log_length); sd_init_req(&hdr, SD_OP_STAT_CLUSTER); hdr.data_length = log_length; ret = dog_exec_req(sdhost, sdport, &hdr, logs); if (ret < 0) goto error; if (!raw_output) printf("Cluster status: "); if (rsp->result == SD_RES_SUCCESS) printf("running, auto-recovery %s\n", logs->disable_recovery ? 
"disabled" : "enabled"); else printf("%s\n", sd_strerror(rsp->result)); if (!raw_output && rsp->data_length > 0) { ct = logs[0].ctime >> 32; printf("\nCluster created at %s\n", ctime(&ct)); printf("Epoch Time Version\n"); } nr_logs = rsp->data_length / sizeof(struct epoch_log); for (i = 0; i < nr_logs; i++) { int j; const struct sd_node *entry; ti = logs[i].time; if (raw_output) { snprintf(time_str, sizeof(time_str), "%" PRIu64, (uint64_t) ti); } else { localtime_r(&ti, &tm); strftime(time_str, sizeof(time_str), "%Y-%m-%d %H:%M:%S", &tm); } printf(raw_output ? "%s %d" : "%s %6d", time_str, logs[i].epoch); printf(" ["); for (j = 0; j < logs[i].nr_nodes; j++) { entry = logs[i].nodes + j; printf("%s%s", (j == 0) ? "" : ", ", addr_to_str(entry->nid.addr, entry->nid.port)); } printf("]\n"); } free(logs); return EXIT_SUCCESS; error: free(logs); return EXIT_SYSFAIL; } static int cluster_shutdown(int argc, char **argv) { int ret; struct sd_req hdr; sd_init_req(&hdr, SD_OP_SHUTDOWN); ret = send_light_req(&hdr, sdhost, sdport); if (ret) { sd_err("failed to execute request"); return EXIT_FAILURE; } return EXIT_SUCCESS; } static void print_list(void *buf, unsigned len) { struct snap_log *log_buf = (struct snap_log *)buf; unsigned nr = len / sizeof(struct snap_log); printf("Index\t\tTag\t\tSnapshot Time\n"); for (unsigned i = 0; i < nr; i++, log_buf++) { time_t *t = (time_t *)&log_buf->time; printf("%d\t\t", log_buf->idx); printf("%s\t\t", log_buf->tag); printf("%s", ctime(t)); } } static int list_snapshot(int argc, char **argv) { char *path = argv[optind++]; void *buf = NULL; int log_nr; int ret = EXIT_SYSFAIL; if (farm_init(path) != SD_RES_SUCCESS) goto out; buf = snap_log_read(&log_nr); if (!buf) goto out; print_list(buf, log_nr * sizeof(struct snap_log)); ret = EXIT_SUCCESS; out: if (ret) sd_err("Fail to list snapshot."); free(buf); return ret; } static void fill_object_tree(uint32_t vid, const char *name, const char *tag, uint32_t snapid, uint32_t flags, const struct 
sd_inode *i, void *data) { uint64_t vdi_oid = vid_to_vdi_oid(vid), vmstate_oid; int nr_vmstate_object; /* ignore active vdi */ if (!vdi_is_snapshot(i)) return; /* fill vdi object id */ object_tree_insert(vdi_oid, i->nr_copies); /* fill data object id */ for (uint64_t idx = 0; idx < MAX_DATA_OBJS; idx++) { if (i->data_vdi_id[idx]) { uint64_t oid = vid_to_data_oid(i->data_vdi_id[idx], idx); object_tree_insert(oid, i->nr_copies); } } /* fill vmstate object id */ nr_vmstate_object = DIV_ROUND_UP(i->vm_state_size, SD_DATA_OBJ_SIZE); for (int idx = 0; idx < nr_vmstate_object; idx++) { vmstate_oid = vid_to_vmstate_oid(vid, idx); object_tree_insert(vmstate_oid, i->nr_copies); } } static int save_snapshot(int argc, char **argv) { char *tag = argv[optind++]; char *path, *p; int ret = EXIT_SYSFAIL, uninitialized_var(unused); unused = strtol(tag, &p, 10); if (tag != p) { sd_err("Tag should not start with number."); return EXIT_USAGE; } if (!argv[optind]) { sd_err("Please specify the path to save snapshot."); return EXIT_USAGE; } path = argv[optind]; if (farm_init(path) != SD_RES_SUCCESS) goto out; if (farm_contain_snapshot(0, tag)) { sd_err("Snapshot tag has already been used for another" " snapshot, please, use another one."); goto out; } if (parse_vdi(fill_object_tree, SD_INODE_SIZE, NULL) != SD_RES_SUCCESS) goto out; if (farm_save_snapshot(tag) != SD_RES_SUCCESS) goto out; ret = EXIT_SUCCESS; out: if (ret) sd_err("Fail to save snapshot to path: %s.", path); object_tree_free(); return ret; } static int load_snapshot(int argc, char **argv) { char *tag = argv[optind++]; char *path, *p; uint32_t idx; int ret = EXIT_SYSFAIL; idx = strtol(tag, &p, 10); if (tag == p) idx = 0; if (!argv[optind]) { sd_err("Please specify the path to save snapshot."); return EXIT_USAGE; } path = argv[optind]; if (farm_init(path) != SD_RES_SUCCESS) goto out; if (!farm_contain_snapshot(idx, tag)) { sd_err("Snapshot index or tag does not exist."); goto out; } if (cluster_format(0, NULL) != 
SD_RES_SUCCESS) goto out; if (farm_load_snapshot(idx, tag) != SD_RES_SUCCESS) goto out; ret = EXIT_SUCCESS; out: if (ret) sd_err("Fail to load snapshot"); return ret; } #define RECOVER_PRINT \ "Caution! Please try starting all the cluster nodes normally before\n" \ "running this command.\n\n" \ "The cluster may need to be force recovered if:\n" \ " - the master node fails to start because of epoch mismatch; or\n" \ " - some nodes fail to start after a cluster shutdown.\n\n" \ "Are you sure you want to continue? [yes/no]: " static int cluster_force_recover(int argc, char **argv) { int ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; char str[123] = {'\0'}; struct sd_node nodes[SD_MAX_NODES]; if (!cluster_cmd_data.force) { int i, l; printf(RECOVER_PRINT); ret = scanf("%s", str); if (ret < 0) return EXIT_SYSFAIL; l = strlen(str); for (i = 0; i < l; i++) str[i] = tolower(str[i]); if (strncmp(str, "yes", 3) != 0) return EXIT_SUCCESS; } sd_init_req(&hdr, SD_OP_FORCE_RECOVER); hdr.data_length = sizeof(nodes); ret = dog_exec_req(sdhost, sdport, &hdr, nodes); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result != SD_RES_SUCCESS) { sd_err("failed to execute request, %s", sd_strerror(rsp->result)); return EXIT_FAILURE; } return EXIT_SUCCESS; } static int cluster_disable_recover(int argc, char **argv) { int ret; struct sd_req hdr; sd_init_req(&hdr, SD_OP_DISABLE_RECOVER); ret = send_light_req(&hdr, sdhost, sdport); if (ret) return EXIT_FAILURE; printf("Cluster recovery: disable\n"); return EXIT_SUCCESS; } static int cluster_enable_recover(int argc, char **argv) { int ret; struct sd_req hdr; sd_init_req(&hdr, SD_OP_ENABLE_RECOVER); ret = send_light_req(&hdr, sdhost, sdport); if (ret) return EXIT_FAILURE; printf("Cluster recovery: enable\n"); return EXIT_SUCCESS; } /* Subcommand list of recover */ static struct subcommand cluster_recover_cmd[] = { {"force", NULL, NULL, "force recover cluster immediately", NULL, 0, cluster_force_recover}, {"enable", NULL, NULL, 
"enable automatic recovery and " "run once recover if necessary", NULL, 0, cluster_enable_recover}, {"disable", NULL, NULL, "disable automatic recovery", NULL, 0, cluster_disable_recover}, {NULL}, }; static int cluster_recover(int argc, char **argv) { return do_generic_subcommand(cluster_recover_cmd, argc, argv); } /* Subcommand list of snapshot */ static struct subcommand cluster_snapshot_cmd[] = { {"save", NULL, "h", "save snapshot to localpath", NULL, CMD_NEED_ARG|CMD_NEED_NODELIST, save_snapshot, NULL}, {"list", NULL, "h", "list snapshot of localpath", NULL, CMD_NEED_ARG, list_snapshot, NULL}, {"load", NULL, "h", "load snapshot from localpath", NULL, CMD_NEED_ARG, load_snapshot, NULL}, {NULL}, }; static int cluster_snapshot(int argc, char **argv) { return do_generic_subcommand(cluster_snapshot_cmd, argc, argv); } static int cluster_reweight(int argc, char **argv) { int ret; struct sd_req hdr; sd_init_req(&hdr, SD_OP_REWEIGHT); ret = send_light_req(&hdr, sdhost, sdport); if (ret) return EXIT_FAILURE; return EXIT_SUCCESS; } static struct subcommand cluster_cmd[] = { {"info", NULL, "aprh", "show cluster information", NULL, CMD_NEED_NODELIST, cluster_info, cluster_options}, {"format", NULL, "bcaph", "create a Sheepdog store", NULL, 0, cluster_format, cluster_options}, {"shutdown", NULL, "aph", "stop Sheepdog", NULL, 0, cluster_shutdown, cluster_options}, {"snapshot", " ", "aph", "snapshot/restore the cluster", cluster_snapshot_cmd, CMD_NEED_ARG, cluster_snapshot, cluster_options}, {"recover", NULL, "afph", "See 'dog cluster recover' for more information", cluster_recover_cmd, CMD_NEED_ARG, cluster_recover, cluster_options}, {"reweight", NULL, "aph", "reweight the cluster", NULL, 0, cluster_reweight, cluster_options}, {NULL,}, }; static int cluster_parser(int ch, char *opt) { int copies; char *p; switch (ch) { case 'b': pstrcpy(cluster_cmd_data.name, sizeof(cluster_cmd_data.name), opt); break; case 'c': copies = strtol(opt, &p, 10); if (opt == p || copies < 1) { 
sd_err("There must be at least one copy of data"); exit(EXIT_FAILURE); } else if (copies > SD_MAX_COPIES) { sd_err("Redundancy may not exceed %d copies", SD_MAX_COPIES); exit(EXIT_FAILURE); } cluster_cmd_data.copies = copies; break; case 'f': cluster_cmd_data.force = true; break; } return 0; } struct command cluster_command = { "cluster", cluster_cmd, cluster_parser }; sheepdog-0.7.5/dog/common.c000066400000000000000000000166421223630776600155530ustar00rootroot00000000000000/* * Copyright (C) 2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "dog.h" #include "sha1.h" #include "sockfd_cache.h" char *size_to_str(uint64_t _size, char *str, int str_size) { const char *units[] = {"MB", "GB", "TB", "PB", "EB", "ZB", "YB"}; int i = 0; double size; if (raw_output) { snprintf(str, str_size, "%" PRIu64, _size); return str; } size = (double)_size; size /= 1024 * 1024; while (i < ARRAY_SIZE(units) - 1 && size >= 1024) { i++; size /= 1024; } if (size >= 10) snprintf(str, str_size, "%.0lf %s", size, units[i]); else snprintf(str, str_size, "%.1lf %s", size, units[i]); return str; } int sd_read_object(uint64_t oid, void *data, unsigned int datalen, uint64_t offset, bool direct) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; int ret; sd_init_req(&hdr, SD_OP_READ_OBJ); hdr.data_length = datalen; hdr.obj.oid = oid; hdr.obj.offset = offset; if (direct) hdr.flags |= SD_FLAG_CMD_DIRECT; ret = dog_exec_req(sdhost, sdport, &hdr, data); if (ret < 0) { sd_err("Failed to read object %" PRIx64, oid); return SD_RES_EIO; } if (rsp->result != SD_RES_SUCCESS) { sd_err("Failed to read object %" PRIx64 " %s", oid, sd_strerror(rsp->result)); return rsp->result; } 
untrim_zero_blocks(data, rsp->obj.offset, rsp->data_length, datalen); return SD_RES_SUCCESS; } int sd_write_object(uint64_t oid, uint64_t cow_oid, void *data, unsigned int datalen, uint64_t offset, uint32_t flags, int copies, bool create, bool direct) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; int ret; if (create) sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_OBJ); else sd_init_req(&hdr, SD_OP_WRITE_OBJ); hdr.data_length = datalen; hdr.flags = flags | SD_FLAG_CMD_WRITE; if (cow_oid) hdr.flags |= SD_FLAG_CMD_COW; if (direct) hdr.flags |= SD_FLAG_CMD_DIRECT; hdr.obj.copies = copies; hdr.obj.oid = oid; hdr.obj.cow_oid = cow_oid; hdr.obj.offset = offset; ret = dog_exec_req(sdhost, sdport, &hdr, data); if (ret < 0) { sd_err("Failed to write object %" PRIx64, oid); return SD_RES_EIO; } if (rsp->result != SD_RES_SUCCESS) { sd_err("Failed to write object %" PRIx64 ": %s", oid, sd_strerror(rsp->result)); return rsp->result; } return SD_RES_SUCCESS; } #define FOR_EACH_VDI(nr, vdis) FOR_EACH_BIT(nr, vdis, SD_NR_VDIS) int parse_vdi(vdi_parser_func_t func, size_t size, void *data) { int ret; unsigned long nr; static struct sd_inode i; struct sd_req req; struct sd_rsp *rsp = (struct sd_rsp *)&req; static DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS); unsigned int rlen = sizeof(vdi_inuse); sd_init_req(&req, SD_OP_READ_VDIS); req.data_length = sizeof(vdi_inuse); ret = dog_exec_req(sdhost, sdport, &req, &vdi_inuse); if (ret < 0) goto out; if (rsp->result != SD_RES_SUCCESS) { sd_err("%s", sd_strerror(rsp->result)); goto out; } FOR_EACH_VDI(nr, vdi_inuse) { uint64_t oid; uint32_t snapid; oid = vid_to_vdi_oid(nr); memset(&i, 0, sizeof(i)); ret = sd_read_object(oid, &i, SD_INODE_HEADER_SIZE, 0, true); if (ret != SD_RES_SUCCESS) { sd_err("Failed to read inode header"); continue; } if (i.name[0] == '\0') /* this VDI has been deleted */ continue; if (size > SD_INODE_HEADER_SIZE) { rlen = DIV_ROUND_UP(i.vdi_size, SD_DATA_OBJ_SIZE) * sizeof(i.data_vdi_id[0]); if (rlen > size - 
SD_INODE_HEADER_SIZE) rlen = size - SD_INODE_HEADER_SIZE; ret = sd_read_object(oid, ((char *)&i) + SD_INODE_HEADER_SIZE, rlen, SD_INODE_HEADER_SIZE, true); if (ret != SD_RES_SUCCESS) { sd_err("Failed to read inode"); continue; } } snapid = vdi_is_snapshot(&i) ? i.snap_id : 0; func(i.vdi_id, i.name, i.tag, snapid, 0, &i, data); } out: return ret; } int dog_exec_req(const uint8_t *addr, int port, struct sd_req *hdr, void *buf) { struct node_id nid = {}; struct sockfd *sfd; int ret; memcpy(nid.addr, addr, sizeof(nid.addr)); nid.port = port; sfd = sockfd_cache_get(&nid); if (!sfd) return -1; /* * Retry forever for dog because * 1. We can't get the newest epoch * 2. Some operations might take unexpected long time */ ret = exec_req(sfd->fd, hdr, buf, NULL, 0, UINT32_MAX); sockfd_cache_put(&nid, sfd); return ret ? -1 : 0; } /* Light request only contains header, without body content. */ int send_light_req(struct sd_req *hdr, const uint8_t *addr, int port) { int ret = dog_exec_req(addr, port, hdr, NULL); struct sd_rsp *rsp = (struct sd_rsp *)hdr; if (ret == -1) return -1; if (rsp->result != SD_RES_SUCCESS) { sd_err("Response's result: %s", sd_strerror(rsp->result)); return -1; } return 0; } int do_generic_subcommand(struct subcommand *sub, int argc, char **argv) { int i, ret; for (i = 0; sub[i].name; i++) { if (!strcmp(sub[i].name, argv[optind])) { unsigned long flags = sub[i].flags; if (flags & CMD_NEED_NODELIST) { ret = update_node_list(SD_MAX_NODES); if (ret < 0) { sd_err("Failed to get node list"); exit(EXIT_SYSFAIL); } } if (flags & CMD_NEED_ARG && argc < 5) subcommand_usage(argv[1], argv[2], EXIT_USAGE); optind++; ret = sub[i].fn(argc, argv); if (ret == EXIT_USAGE) subcommand_usage(argv[1], argv[2], EXIT_USAGE); return ret; } } subcommand_usage(argv[1], argv[2], EXIT_FAILURE); return EXIT_FAILURE; } void confirm(const char *message) { char input[8] = ""; char *ret; printf("%s", message); ret = fgets(input, sizeof(input), stdin); if (ret == NULL || strncasecmp(input, 
"yes", 3) != 0) exit(EXIT_SUCCESS); } void work_queue_wait(struct work_queue *q) { while (!work_queue_empty(q)) event_loop(-1); } #define DEFAULT_SCREEN_WIDTH 80 static int get_screen_width(void) { struct winsize wsz; if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &wsz) < 0) return DEFAULT_SCREEN_WIDTH; return wsz.ws_col; } /* * Show prograss bar as follows. * * 45.0 % [===============> ] 180 MB / 400 MB */ void show_progress(uint64_t done, uint64_t total, bool raw) { char done_str[256], total_str[256]; int screen_width = get_screen_width(); int bar_length = screen_width - 30; char *buf; if (!is_stdout_console()) return; if (screen_width <= 0) return; printf("\r"); /* move to the beginning of the line */ if (raw) { snprintf(done_str, sizeof(done_str), "%"PRIu64, done); snprintf(total_str, sizeof(total_str), "%"PRIu64, total); } else { size_to_str(done, done_str, sizeof(done_str)); size_to_str(total, total_str, sizeof(total_str)); } buf = xmalloc(screen_width + 1); snprintf(buf, screen_width, "%5.1lf %% [", (double)done / total * 100); for (int i = 0; i < bar_length; i++) { if (total * (i + 1) / bar_length <= done) strcat(buf, "="); else if (total * i / bar_length <= done && done < total * (i + 1) / bar_length) strcat(buf, ">"); else strcat(buf, " "); } snprintf(buf + strlen(buf), screen_width - strlen(buf), "] %s / %s", done_str, total_str); /* fill the rest of buffer with blank characters */ memset(buf + strlen(buf), ' ', screen_width - strlen(buf)); buf[screen_width] = '\0'; printf("%s", buf); if (done == total) printf("\n"); fflush(stdout); free(buf); } sheepdog-0.7.5/dog/dog.c000066400000000000000000000227251223630776600150330ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. 
* * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include #include "sheepdog_proto.h" #include "sheep.h" #include "dog.h" #include "util.h" #include "sockfd_cache.h" #define EPOLL_SIZE 4096 static const char program_name[] = "dog"; /* default sdhost is "127.0.0.1" */ uint8_t sdhost[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, 0, 0, 1 }; int sdport = SD_LISTEN_PORT; bool highlight = true; bool raw_output; bool verbose; static const struct sd_option dog_options[] = { /* common options for all dog commands */ {'a', "address", true, "specify the daemon address (default: localhost)"}, {'p', "port", true, "specify the daemon port"}, {'r', "raw", false, "raw output mode: omit headers, separate fields with\n" " single spaces and print all sizes in decimal bytes"}, {'v', "verbose", false, "print more information than default"}, {'h', "help", false, "display this help and exit"}, { 0, NULL, false, NULL }, }; static void usage(const struct command *commands, int status); uint32_t sd_epoch; struct sd_node sd_nodes[SD_MAX_NODES]; struct sd_vnode sd_vnodes[SD_MAX_VNODES]; int sd_nodes_nr, sd_vnodes_nr; int update_node_list(int max_nodes) { int ret; unsigned int size; char *buf = NULL; struct sd_node *ent; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; size = sizeof(*ent) * max_nodes; buf = xzalloc(size); sd_init_req(&hdr, SD_OP_GET_NODE_LIST); hdr.data_length = size; ret = dog_exec_req(sdhost, sdport, &hdr, buf); if (ret < 0) goto out; if (rsp->result != SD_RES_SUCCESS) { sd_err("Failed to update node list: %s", sd_strerror(rsp->result)); ret = -1; goto out; } size = rsp->data_length; sd_nodes_nr = size / sizeof(*ent); if (sd_nodes_nr == 0) { sd_err("There are no active sheep daemons"); exit(EXIT_FAILURE); } /* FIXME */ if (sd_nodes_nr > max_nodes) { ret = -1; goto out; } memcpy(sd_nodes, buf, size); sd_vnodes_nr = nodes_to_vnodes(sd_nodes, sd_nodes_nr, sd_vnodes); 
sd_epoch = hdr.epoch; out: if (buf) free(buf); return ret; } static int (*command_parser)(int, char *); static int (*command_fn)(int, char **); static const char *command_opts; static const char *command_arg; static const char *command_desc; static struct sd_option *command_options; static const struct sd_option *find_opt(int ch) { const struct sd_option *opt; /* search for common options */ sd_for_each_option(opt, dog_options) { if (opt->ch == ch) return opt; } /* search for self options */ if (command_options) { sd_for_each_option(opt, command_options) { if (opt->ch == ch) return opt; } } sd_err("Internal error"); exit(EXIT_SYSFAIL); } static void init_commands(const struct command **commands) { static struct command *cmds; struct command command_list[] = { vdi_command, node_command, cluster_command, trace_command, {NULL,} }; if (!cmds) { cmds = (struct command *)xmalloc(sizeof(command_list)); memcpy(cmds, command_list, sizeof(command_list)); } *commands = cmds; return; } static const struct subcommand *find_subcmd(const char *cmd, const char *subcmd) { int i, j; const struct command *commands; const struct subcommand *sub; init_commands(&commands); for (i = 0; commands[i].name; i++) { if (!strcmp(commands[i].name, cmd)) { sub = commands[i].sub; for (j = 0; sub[j].name; j++) { if (!strcmp(sub[j].name, subcmd)) return &sub[j]; } } } return NULL; } static unsigned long setup_commands(const struct command *commands, char *cmd, char *subcmd) { int i; bool found = false; struct subcommand *s; unsigned long flags = 0; for (i = 0; commands[i].name; i++) { if (!strcmp(commands[i].name, cmd)) { found = true; if (commands[i].parser) command_parser = commands[i].parser; break; } } if (!found) { if (cmd && strcmp(cmd, "help") && strcmp(cmd, "--help") && strcmp(cmd, "-h")) { sd_err("Invalid command '%s'", cmd); usage(commands, EXIT_USAGE); } usage(commands, 0); } for (s = commands[i].sub; subcmd && s->name; s++) { if (!strcmp(s->name, subcmd)) { command_fn = s->fn; 
command_opts = s->opts; command_arg = s->arg; command_desc = s->desc; command_options = s->options; flags = s->flags; break; } } if (!command_fn) { if (subcmd && strcmp(subcmd, "help") && strcmp(subcmd, "--help") && strcmp(subcmd, "-h")) sd_err("Invalid command '%s %s'", cmd, subcmd); sd_err("Available %s commands:", cmd); for (s = commands[i].sub; s->name; s++) sd_err(" %s %s", cmd, s->name); exit(EXIT_USAGE); } return flags; } static void usage(const struct command *commands, int status) { int i; struct subcommand *s; char name[64]; if (status) sd_err("Try '%s --help' for more information.", program_name); else { printf("Sheepdog administrator utility\n"); printf("Usage: %s [options]\n", program_name); printf("\nAvailable commands:\n"); for (i = 0; commands[i].name; i++) { for (s = commands[i].sub; s->name; s++) { snprintf(name, sizeof(name), "%s %s", commands[i].name, s->name); printf(" %-24s%s\n", name, s->desc); } } printf("\n"); printf("For more information, run " "'%s --help'.\n", program_name); } exit(status); } void subcommand_usage(char *cmd, char *subcmd, int status) { int i, n, len = strlen(command_opts); const struct sd_option *sd_opt; const struct subcommand *sub, *subsub; char name[64]; printf("Usage: %s %s %s", program_name, cmd, subcmd); /* Show subcmd's subcommands if necessary */ sub = find_subcmd(cmd, subcmd); subsub = sub->sub; if (subsub) { n = 0; while (subsub[n].name) n++; if (n == 1) printf(" %s", subsub[0].name); else if (n > 1) { printf(" {%s", subsub[0].name); for (i = 1; i < n; i++) printf("|%s", subsub[i].name); printf("}"); } } for (i = 0; i < len; i++) { sd_opt = find_opt(command_opts[i]); if (sd_opt->has_arg) printf(" [-%c %s]", sd_opt->ch, sd_opt->name); else printf(" [-%c]", sd_opt->ch); } if (command_arg) printf(" %s", command_arg); printf("\n"); if (subsub) { printf("Available subcommands:\n"); for (i = 0; subsub[i].name; i++) printf(" %-24s%s\n", subsub[i].name, subsub[i].desc); } printf("Options:\n"); for (i = 0; i < len; i++) 
{ sd_opt = find_opt(command_opts[i]); snprintf(name, sizeof(name), "-%c, --%s", sd_opt->ch, sd_opt->name); printf(" %-24s%s\n", name, sd_opt->desc); } exit(status); } static const struct sd_option *build_sd_options(const char *opts) { static struct sd_option sd_opts[256], *p; int i, len = strlen(opts); p = sd_opts; for (i = 0; i < len; i++) *p++ = *find_opt(opts[i]); memset(p, 0, sizeof(struct sd_option)); return sd_opts; } static void crash_handler(int signo) { sd_err("dog exits unexpectedly (%s).", strsignal(signo)); sd_backtrace(); /* * OOM raises SIGABRT in xmalloc but the administrator expects * that dog exits with EXIT_SYSFAIL. We have to give up * dumping a core file in this case. */ if (signo == SIGABRT) exit(EXIT_SYSFAIL); reraise_crash_signal(signo, EXIT_SYSFAIL); } static size_t get_nr_nodes(void) { return sd_nodes_nr; } int main(int argc, char **argv) { int ch, longindex, ret; unsigned long flags; struct option *long_options; const struct command *commands; const char *short_options; char *p; const struct sd_option *sd_opts; install_crash_handler(crash_handler); init_commands(&commands); if (argc < 2) usage(commands, 0); flags = setup_commands(commands, argv[1], argv[2]); optind = 3; sd_opts = build_sd_options(command_opts); long_options = build_long_options(sd_opts); short_options = build_short_options(sd_opts); while ((ch = getopt_long(argc, argv, short_options, long_options, &longindex)) >= 0) { switch (ch) { case 'a': if (!str_to_addr(optarg, sdhost)) { sd_err("Invalid ip address %s", optarg); return EXIT_FAILURE; } break; case 'p': sdport = strtol(optarg, &p, 10); if (optarg == p || sdport < 1 || sdport > UINT16_MAX) { sd_err("Invalid port number '%s'", optarg); exit(EXIT_USAGE); } break; case 'r': raw_output = true; break; case 'v': verbose = true; break; case 'h': subcommand_usage(argv[1], argv[2], EXIT_SUCCESS); break; case '?': usage(commands, EXIT_USAGE); break; default: if (command_parser) command_parser(ch, optarg); else usage(commands, 
EXIT_USAGE); break; } } if (!is_stdout_console() || raw_output) highlight = false; if (flags & CMD_NEED_NODELIST) { ret = update_node_list(SD_MAX_NODES); if (ret < 0) { sd_err("Failed to get node list"); exit(EXIT_SYSFAIL); } } if (flags & CMD_NEED_ARG && argc == optind) subcommand_usage(argv[1], argv[2], EXIT_USAGE); if (init_event(EPOLL_SIZE) < 0) exit(EXIT_SYSFAIL); if (init_work_queue(get_nr_nodes) != 0) { sd_err("Failed to init work queue"); exit(EXIT_SYSFAIL); } if (sockfd_init()) { sd_err("sockfd_init() failed"); exit(EXIT_SYSFAIL); } ret = command_fn(argc, argv); if (ret == EXIT_USAGE) subcommand_usage(argv[1], argv[2], EXIT_USAGE); return ret; } sheepdog-0.7.5/dog/dog.h000066400000000000000000000054141223630776600150340ustar00rootroot00000000000000/* * Copyright (C) 2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef __DOG_H__ #define __DOG_H__ #include #include #include #include #include #include #include "sheepdog_proto.h" #include "sheep.h" #include "exits.h" #include "option.h" #include "work.h" #include "event.h" #include "config.h" #define CMD_NEED_NODELIST (1 << 0) #define CMD_NEED_ARG (1 << 1) #define UINT64_DECIMAL_SIZE 21 struct command { const char *name; struct subcommand *sub; int (*parser)(int, char *); }; struct subcommand { const char *name; const char *arg; const char *opts; const char *desc; struct subcommand *sub; unsigned long flags; int (*fn)(int, char **); struct sd_option *options; }; void subcommand_usage(char *cmd, char *subcmd, int status); extern uint8_t sdhost[16]; extern int sdport; extern bool highlight; extern bool raw_output; extern bool verbose; extern uint32_t sd_epoch; extern struct sd_node sd_nodes[SD_MAX_NODES]; extern struct sd_vnode sd_vnodes[SD_MAX_VNODES]; extern int sd_nodes_nr, sd_vnodes_nr; bool is_current(const struct sd_inode *i); char *size_to_str(uint64_t _size, char *str, int str_size); typedef void (*vdi_parser_func_t)(uint32_t vid, const char *name, const char *tag, uint32_t snapid, uint32_t flags, const struct sd_inode *i, void *data); int parse_vdi(vdi_parser_func_t func, size_t size, void *data); int sd_read_object(uint64_t oid, void *data, unsigned int datalen, uint64_t offset, bool direct); int sd_write_object(uint64_t oid, uint64_t cow_oid, void *data, unsigned int datalen, uint64_t offset, uint32_t flags, int copies, bool create, bool direct); int dog_exec_req(const uint8_t *addr, int port, struct sd_req *hdr, void *data); int send_light_req(struct sd_req *hdr, const uint8_t *addr, int port); int do_generic_subcommand(struct subcommand *sub, int argc, char **argv); int update_node_list(int max_nodes); void confirm(const char *message); void work_queue_wait(struct work_queue *q); int do_vdi_create(const char *vdiname, int64_t vdi_size, uint32_t base_vid, uint32_t *vdi_id, bool snapshot, int nr_copies); void 
show_progress(uint64_t done, uint64_t total, bool raw); extern struct command vdi_command; extern struct command node_command; extern struct command cluster_command; #ifdef HAVE_TRACE extern struct command trace_command; #else #define trace_command {} #endif /* HAVE_TRACE */ #endif sheepdog-0.7.5/dog/farm/000077500000000000000000000000001223630776600150335ustar00rootroot00000000000000sheepdog-0.7.5/dog/farm/farm.c000066400000000000000000000214351223630776600161310ustar00rootroot00000000000000/* * Copyright (C) 2011 Taobao Inc. * Copyright (C) 2013 Zelin.io * * Liu Yuan * Kai Zhang * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include "farm.h" #include "list.h" static char farm_object_dir[PATH_MAX]; static char farm_dir[PATH_MAX]; static struct sd_lock vdi_list_lock = SD_LOCK_INITIALIZER; struct vdi_entry { char name[SD_MAX_VDI_LEN]; uint64_t vdi_size; uint32_t vdi_id; uint32_t snap_id; uint8_t nr_copies; struct list_head list; }; static LIST_HEAD(last_vdi_list); struct snapshot_work { struct trunk_entry entry; struct strbuf *trunk_buf; struct work work; }; static struct work_queue *wq; static uatomic_bool work_error; static struct vdi_entry *find_vdi(const char *name) { struct vdi_entry *vdi; list_for_each_entry(vdi, &last_vdi_list, list) { if (!strcmp(vdi->name, name)) return vdi; } return NULL; } static struct vdi_entry *new_vdi(const char *name, uint64_t vdi_size, uint32_t vdi_id, uint32_t snap_id, uint8_t nr_copies) { struct vdi_entry *vdi; vdi = xmalloc(sizeof(struct vdi_entry)); pstrcpy(vdi->name, sizeof(vdi->name), name); vdi->vdi_size = vdi_size; vdi->vdi_id = vdi_id; vdi->snap_id = snap_id; vdi->nr_copies = nr_copies; INIT_LIST_HEAD(&vdi->list); return vdi; } static void insert_vdi(struct sd_inode 
*new) { struct vdi_entry *vdi; vdi = find_vdi(new->name); if (!vdi) { vdi = new_vdi(new->name, new->vdi_size, new->vdi_id, new->snap_id, new->nr_copies); list_add(&vdi->list, &last_vdi_list); } else if (vdi->snap_id < new->snap_id) { vdi->vdi_size = new->vdi_size; vdi->vdi_id = new->vdi_id; vdi->snap_id = new->snap_id; vdi->nr_copies = new->nr_copies; } } static int create_active_vdis(void) { struct vdi_entry *vdi; uint32_t new_vid; list_for_each_entry(vdi, &last_vdi_list, list) { if (do_vdi_create(vdi->name, vdi->vdi_size, vdi->vdi_id, &new_vid, false, vdi->nr_copies) < 0) return -1; } return 0; } static void free_vdi_list(void) { struct vdi_entry *vdi, *next; list_for_each_entry_safe(vdi, next, &last_vdi_list, list) free(vdi); } char *get_object_directory(void) { return farm_object_dir; } static int create_directory(const char *p) { int ret = -1; struct strbuf buf = STRBUF_INIT; strbuf_addstr(&buf, p); if (xmkdir(buf.buf, 0755) < 0) { if (errno == EEXIST) sd_err("Path is not a directory: %s", p); goto out; } if (!strlen(farm_dir)) strbuf_copyout(&buf, farm_dir, sizeof(farm_dir)); strbuf_addstr(&buf, "/objects"); if (xmkdir(buf.buf, 0755) < 0) goto out; for (int i = 0; i < 256; i++) { strbuf_addf(&buf, "/%02x", i); if (xmkdir(buf.buf, 0755) < 0) goto out; strbuf_remove(&buf, buf.len - 3, 3); } if (!strlen(farm_object_dir)) strbuf_copyout(&buf, farm_object_dir, sizeof(farm_object_dir)); ret = 0; out: if (ret) sd_err("Fail to create directory: %m"); strbuf_release(&buf); return ret; } static int get_trunk_sha1(uint32_t idx, const char *tag, unsigned char *outsha1) { int nr_logs = -1, ret = -1; struct snap_log *log_buf, *log_free = NULL; struct snap_file *snap_buf = NULL; log_free = log_buf = snap_log_read(&nr_logs); if (nr_logs < 0) goto out; for (int i = 0; i < nr_logs; i++, log_buf++) { if (log_buf->idx != idx && strcmp(log_buf->tag, tag)) continue; snap_buf = snap_file_read(log_buf->sha1); if (!snap_buf) goto out; memcpy(outsha1, snap_buf->trunk_sha1, 
SHA1_DIGEST_SIZE); ret = 0; goto out; } out: free(log_free); free(snap_buf); return ret; } static int notify_vdi_add(uint32_t vdi_id, uint32_t nr_copies) { int ret = -1; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; char *buf = NULL; sd_init_req(&hdr, SD_OP_NOTIFY_VDI_ADD); hdr.vdi_state.new_vid = vdi_id; hdr.vdi_state.copies = nr_copies; hdr.vdi_state.set_bitmap = true; ret = dog_exec_req(sdhost, sdport, &hdr, buf); if (ret < 0) sd_err("Fail to notify vdi add event(%"PRIx32", %d)", vdi_id, nr_copies); if (rsp->result != SD_RES_SUCCESS) { sd_err("%s", sd_strerror(rsp->result)); ret = -1; } free(buf); return ret; } int farm_init(const char *path) { int ret = -1; if (create_directory(path) < 0) goto out; if (snap_init(farm_dir) < 0) goto out; return 0; out: if (ret) sd_err("Fail to init farm."); return ret; } bool farm_contain_snapshot(uint32_t idx, const char *tag) { unsigned char trunk_sha1[SHA1_DIGEST_SIZE]; return (get_trunk_sha1(idx, tag, trunk_sha1) == 0); } static void do_save_object(struct work *work) { void *buf; size_t size; struct snapshot_work *sw; if (uatomic_is_true(&work_error)) return; sw = container_of(work, struct snapshot_work, work); size = get_objsize(sw->entry.oid); buf = xmalloc(size); if (sd_read_object(sw->entry.oid, buf, size, 0, true) < 0) goto error; if (slice_write(buf, size, sw->entry.sha1) < 0) goto error; free(buf); return; error: free(buf); sd_err("Fail to save object, oid %"PRIx64, sw->entry.oid); uatomic_set_true(&work_error); } static void farm_show_progress(uint64_t done, uint64_t total) { return show_progress(done, total, true); } static void save_object_done(struct work *work) { struct snapshot_work *sw = container_of(work, struct snapshot_work, work); static unsigned long saved; if (uatomic_is_true(&work_error)) goto out; strbuf_add(sw->trunk_buf, &sw->entry, sizeof(struct trunk_entry)); farm_show_progress(uatomic_add_return(&saved, 1), object_tree_size()); out: free(sw); } static int 
queue_save_snapshot_work(uint64_t oid, int nr_copies, void *data) { struct snapshot_work *sw = xzalloc(sizeof(struct snapshot_work)); struct strbuf *trunk_buf = data; sw->entry.oid = oid; sw->entry.nr_copies = nr_copies; sw->trunk_buf = trunk_buf; sw->work.fn = do_save_object; sw->work.done = save_object_done; queue_work(wq, &sw->work); return 0; } int farm_save_snapshot(const char *tag) { unsigned char snap_sha1[SHA1_DIGEST_SIZE]; unsigned char trunk_sha1[SHA1_DIGEST_SIZE]; struct strbuf trunk_buf; void *snap_log = NULL; int log_nr, idx, ret = -1; uint64_t nr_objects = object_tree_size(); snap_log = snap_log_read(&log_nr); if (!snap_log) goto out; idx = log_nr + 1; strbuf_init(&trunk_buf, sizeof(struct trunk_entry) * nr_objects); wq = create_work_queue("save snapshot", WQ_ORDERED); if (for_each_object_in_tree(queue_save_snapshot_work, &trunk_buf) < 0) goto out; work_queue_wait(wq); if (uatomic_is_true(&work_error)) goto out; if (trunk_file_write(nr_objects, (struct trunk_entry *)trunk_buf.buf, trunk_sha1) < 0) goto out; if (snap_file_write(idx, trunk_sha1, snap_sha1) < 0) goto out; if (snap_log_write(idx, tag, snap_sha1) < 0) goto out; ret = 0; out: strbuf_release(&trunk_buf); free(snap_log); return ret; } static void do_load_object(struct work *work) { void *buffer = NULL; size_t size; struct snapshot_work *sw; static unsigned long loaded; if (uatomic_is_true(&work_error)) return; sw = container_of(work, struct snapshot_work, work); buffer = slice_read(sw->entry.sha1, &size); if (!buffer) goto error; if (sd_write_object(sw->entry.oid, 0, buffer, size, 0, 0, sw->entry.nr_copies, true, true) != 0) goto error; if (is_vdi_obj(sw->entry.oid)) { if (notify_vdi_add(oid_to_vid(sw->entry.oid), sw->entry.nr_copies) < 0) goto error; sd_write_lock(&vdi_list_lock); insert_vdi(buffer); sd_unlock(&vdi_list_lock); } farm_show_progress(uatomic_add_return(&loaded, 1), trunk_get_count()); free(buffer); return; error: free(buffer); sd_err("Fail to load object, oid %"PRIx64, 
sw->entry.oid); uatomic_set_true(&work_error); } static void load_object_done(struct work *work) { struct snapshot_work *sw = container_of(work, struct snapshot_work, work); free(sw); } static int queue_load_snapshot_work(struct trunk_entry *entry, void *data) { struct snapshot_work *sw = xzalloc(sizeof(struct snapshot_work)); memcpy(&sw->entry, entry, sizeof(struct trunk_entry)); sw->work.fn = do_load_object; sw->work.done = load_object_done; queue_work(wq, &sw->work); return 0; } int farm_load_snapshot(uint32_t idx, const char *tag) { int ret = -1; unsigned char trunk_sha1[SHA1_DIGEST_SIZE]; if (get_trunk_sha1(idx, tag, trunk_sha1) < 0) goto out; wq = create_work_queue("load snapshot", WQ_DYNAMIC); if (for_each_entry_in_trunk(trunk_sha1, queue_load_snapshot_work, NULL) < 0) goto out; work_queue_wait(wq); if (uatomic_is_true(&work_error)) goto out; if (create_active_vdis() < 0) goto out; ret = 0; out: free_vdi_list(); return ret; } sheepdog-0.7.5/dog/farm/farm.h000066400000000000000000000041051223630776600161310ustar00rootroot00000000000000#ifndef FARM_H #define FARM_H #include #include #include #include #include #include #include #include #include #include #include #include "dog.h" #include "sheep.h" #include "strbuf.h" #include "sha1.h" struct trunk_entry { uint64_t oid; int nr_copies; unsigned char sha1[SHA1_DIGEST_SIZE]; }; struct trunk_file { uint64_t nr_entries; struct trunk_entry *entries; }; struct snap_file { int idx; unsigned char trunk_sha1[SHA1_DIGEST_SIZE]; }; struct snap_log { uint32_t idx; char tag[SD_MAX_SNAPSHOT_TAG_LEN]; uint64_t time; unsigned char sha1[SHA1_DIGEST_SIZE]; }; /* farm.c */ int farm_init(const char *path); bool farm_contain_snapshot(uint32_t idx, const char *tag); int farm_save_snapshot(const char *tag); int farm_load_snapshot(uint32_t idx, const char *tag); char *get_object_directory(void); /* trunk.c */ int trunk_init(void); int trunk_file_write(uint64_t nr_entries, struct trunk_entry *entries, unsigned char *trunk_sha1); int 
for_each_entry_in_trunk(unsigned char *trunk_sha1, int (*func)(struct trunk_entry *entry, void *data), void *data); uint64_t trunk_get_count(void); /* snap.c */ int snap_init(const char *path); struct snap_file *snap_file_read(unsigned char *sha1); int snap_file_write(uint32_t idx, unsigned char *trunk_sha1, unsigned char *outsha1); void *snap_log_read(int *out_nr); int snap_log_write(uint32_t idx, const char *tag, unsigned char *sha1); /* sha1_file.c */ int sha1_file_write(void *buf, size_t len, unsigned char *sha1); void *sha1_file_read(const unsigned char *sha1, size_t *size); /* object_tree.c */ int object_tree_size(void); void object_tree_insert(uint64_t oid, int nr_copies); void object_tree_free(void); void object_tree_print(void); int for_each_object_in_tree(int (*func)(uint64_t oid, int nr_copies, void *data), void *data); /* slice.c */ int slice_write(void *buf, size_t len, unsigned char *outsha1); void *slice_read(const unsigned char *sha1, size_t *outsize); #endif sheepdog-0.7.5/dog/farm/object_tree.c000066400000000000000000000052161223630776600174700ustar00rootroot00000000000000/* * Copyright (C) 2013 Zelin.io * * Kai Zhang * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "farm.h" #include "rbtree.h" struct object_tree_entry { uint64_t oid; int nr_copies; struct rb_node node; struct list_head list; }; struct object_tree { int nr_objs; struct rb_root root; struct list_head list; }; static struct object_tree tree = { .nr_objs = 0, .root = RB_ROOT, .list = LIST_HEAD_INIT(tree.list) }; static struct object_tree_entry *cached_entry; static struct object_tree_entry *do_insert(struct rb_root *root, struct object_tree_entry *new) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct object_tree_entry *entry; while (*p) { parent = *p; entry = rb_entry(parent, struct object_tree_entry, node); if (new->oid < entry->oid) p = &(*p)->rb_left; else if (new->oid > entry->oid) p = &(*p)->rb_right; else return entry; /* already has this entry */ } rb_link_node(&new->node, parent, p); rb_insert_color(&new->node, root); return NULL; /* insert sucessfully */ } void object_tree_insert(uint64_t oid, int nr_copies) { struct rb_root *root = &tree.root; struct object_tree_entry *p = NULL; if (!cached_entry) cached_entry = xzalloc(sizeof(*cached_entry)); cached_entry->oid = oid; cached_entry->nr_copies = nr_copies; rb_init_node(&cached_entry->node); p = do_insert(root, cached_entry); if (!p) { list_add(&cached_entry->list, &tree.list); tree.nr_objs++; cached_entry = NULL; } } void object_tree_print(void) { struct rb_node *p = rb_first(&tree.root); struct object_tree_entry *entry; printf("nr_objs: %d\n", tree.nr_objs); while (p) { entry = rb_entry(p, struct object_tree_entry, node); printf("Obj id: %"PRIx64"\n", entry->oid); p = rb_next(p); } } void object_tree_free(void) { struct object_tree_entry *entry, *next; list_for_each_entry_safe(entry, next, &tree.list, list) free(entry); free(cached_entry); } int object_tree_size(void) { return tree.nr_objs; } int for_each_object_in_tree(int (*func)(uint64_t oid, int nr_copies, void *data), void *data) { struct rb_node *p = rb_first(&tree.root); struct object_tree_entry *entry; int 
ret = -1; while (p) { entry = rb_entry(p, struct object_tree_entry, node); if (func(entry->oid, entry->nr_copies, data) < 0) goto out; p = rb_next(p); } ret = 0; out: return ret; } sheepdog-0.7.5/dog/farm/sha1_file.c000066400000000000000000000063451223630776600170420ustar00rootroot00000000000000/* * Copyright (C) 2011 Taobao Inc. * * Liu Yuan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /* * sha1_file provide us some useful features: * * - Regardless of object type, all objects are all in deflated with zlib, * and have a header that not only specifies their tag, but also size * information about the data in the object. * * - the general consistency of an object can always be tested independently * of the contents or the type of the object: all objects can be validated * by verifying that their hashes match the content of the file. 
*/ #include #include "farm.h" #include "util.h" static void fill_sha1_path(char *pathbuf, const unsigned char *sha1) { int i; for (i = 0; i < SHA1_DIGEST_SIZE; i++) { static const char hex[] = "0123456789abcdef"; unsigned int val = sha1[i]; char *pos = pathbuf + i*2 + (i > 0); *pos++ = hex[val >> 4]; *pos = hex[val & 0xf]; } } static char *sha1_to_path(const unsigned char *sha1) { static __thread char buf[PATH_MAX]; const char *objdir; int len; objdir = get_object_directory(); len = strlen(objdir); /* '/' + sha1(2) + '/' + sha1(38) + '\0' */ memcpy(buf, objdir, len); buf[len] = '/'; buf[len+3] = '/'; buf[len+42] = '\0'; fill_sha1_path(buf + len + 1, sha1); return buf; } static int sha1_buffer_write(const unsigned char *sha1, void *buf, unsigned int size) { char *filename = sha1_to_path(sha1); int fd, ret = 0, len; fd = open(filename, O_WRONLY | O_CREAT | O_EXCL, 0666); if (fd < 0) { if (errno != EEXIST) { sd_err("failed to open file %s with error: %m", filename); ret = -1; } goto err_open; } len = xwrite(fd, buf, size); if (len != size) { sd_err("%m"); close(fd); return -1; } close(fd); err_open: return ret; } int sha1_file_write(void *buf, size_t len, unsigned char *outsha1) { unsigned char sha1[SHA1_DIGEST_SIZE]; sha1_from_buffer(buf, len, sha1); if (sha1_buffer_write(sha1, buf, len) < 0) return -1; if (outsha1) memcpy(outsha1, sha1, SHA1_DIGEST_SIZE); return 0; } static int verify_sha1_file(const unsigned char *sha1, void *buf, unsigned long len) { unsigned char tmp[SHA1_DIGEST_SIZE]; sha1_from_buffer(buf, len, tmp); if (memcmp((char *)tmp, (char *)sha1, SHA1_DIGEST_SIZE) != 0) { sd_err("failed, %s != %s", sha1_to_hex(sha1), sha1_to_hex(tmp)); return -1; } return 0; } void *sha1_file_read(const unsigned char *sha1, size_t *size) { char *filename = sha1_to_path(sha1); int fd = open(filename, O_RDONLY); struct stat st; void *buf = NULL; if (fd < 0) { perror(filename); return NULL; } if (fstat(fd, &st) < 0) { sd_err("%m"); goto out; } buf = xmalloc(st.st_size); if 
(!buf) goto out; if (xread(fd, buf, st.st_size) != st.st_size) { free(buf); buf = NULL; goto out; } if (verify_sha1_file(sha1, buf, st.st_size) < 0) { free(buf); buf = NULL; goto out; } *size = st.st_size; out: close(fd); return buf; } sheepdog-0.7.5/dog/farm/slice.c000066400000000000000000000047031223630776600163020ustar00rootroot00000000000000/* * copyright (c) 2013 taobao inc. * * liu yuan * * this program is free software; you can redistribute it and/or * modify it under the terms of the gnu general public license version * 2 as published by the free software foundation. * * you should have received a copy of the gnu general public license * along with this program. if not, see . */ /* * Slice is a fixed chunk of one object to be stored in farm. We slice * the object into smaller chunks to get better deduplication. */ #include #include #include #include "farm.h" #include "strbuf.h" #include "util.h" #include "sheepdog_proto.h" struct slice { unsigned char sha1[SHA1_DIGEST_SIZE]; }; struct slice_file { uint32_t nr_slices; struct slice *slices; }; /* 128k, best empirical value from some tests, but no rationale */ #define SLICE_SIZE (1024*128) int slice_write(void *buf, size_t len, unsigned char *outsha1) { int count = DIV_ROUND_UP(len, SLICE_SIZE); size_t slen = count * SHA1_DIGEST_SIZE; char *sbuf = xmalloc(slen); char *p = buf; for (int i = 0; i < count; i++, p += SLICE_SIZE) { unsigned char sha1[SHA1_DIGEST_SIZE]; size_t wlen = (ssize_t)len - SLICE_SIZE > 0 ? 
SLICE_SIZE : len; len -= SLICE_SIZE; if (sha1_file_write(p, wlen, sha1) < 0) goto err; memcpy(sbuf + i * SHA1_DIGEST_SIZE, sha1, SHA1_DIGEST_SIZE); } if (sha1_file_write(sbuf, slen, outsha1) < 0) goto err; free(sbuf); return 0; err: free(sbuf); return -1; } static struct slice_file *slice_file_read(const unsigned char *sha1) { size_t size; struct slice_file *slice_file = NULL; void *buf = sha1_file_read(sha1, &size); if (!buf) return NULL; slice_file = xmalloc(sizeof(struct slice_file)); slice_file->nr_slices = size / SHA1_DIGEST_SIZE; slice_file->slices = buf; return slice_file; } void *slice_read(const unsigned char *sha1, size_t *outsize) { struct slice_file *file = slice_file_read(sha1); struct strbuf buf = STRBUF_INIT; void *object; if (!file) goto err; *outsize = 0; for (uint32_t i = 0; i < file->nr_slices; i++) { size_t size; void *sbuf; sbuf = sha1_file_read(file->slices[i].sha1, &size); if (!sbuf) goto err; strbuf_add(&buf, sbuf, size); free(sbuf); *outsize += size; } object = xmalloc(*outsize); strbuf_copyout(&buf, object, *outsize); free(file); strbuf_release(&buf); return object; err: free(file); strbuf_release(&buf); return NULL; } sheepdog-0.7.5/dog/farm/snap.c000066400000000000000000000050071223630776600161420ustar00rootroot00000000000000/* * Copyright (C) 2011 Taobao Inc. * Copyright (C) 2013 Zelin.io * * Liu Yuan * Kai Zhang * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /* Snap object is the meta data that describes the cluster snapshot. 
*/ #include #include #include #include "farm.h" static char snap_log_path[PATH_MAX]; int snap_init(const char *farm_dir) { int fd, ret = -1; struct strbuf buf = STRBUF_INIT; strbuf_addstr(&buf, farm_dir); strbuf_addf(&buf, "/%s", "user_snap"); if (!strlen(snap_log_path)) strbuf_copyout(&buf, snap_log_path, sizeof(snap_log_path)); fd = open(snap_log_path, O_CREAT | O_EXCL, 0666); if (fd < 0) { if (errno != EEXIST) { sd_err("%m"); goto out; } } ret = 0; close(fd); out: strbuf_release(&buf); return ret; } int snap_log_write(uint32_t idx, const char *tag, unsigned char *sha1) { int fd, ret = -1; struct strbuf buf = STRBUF_INIT; struct snap_log log = { .idx = idx, .time = time(NULL) }; pstrcpy(log.tag, SD_MAX_SNAPSHOT_TAG_LEN, tag); memcpy(log.sha1, sha1, SHA1_DIGEST_SIZE); fd = open(snap_log_path, O_WRONLY | O_APPEND); if (fd < 0) { sd_err("%m"); goto out; } strbuf_reset(&buf); strbuf_add(&buf, &log, sizeof(log)); ret = xwrite(fd, buf.buf, buf.len); if (ret != buf.len) goto out_close; ret = 0; out_close: close(fd); out: strbuf_release(&buf); return ret; } void *snap_log_read(int *out_nr) { struct stat st; void *buffer = NULL; int len, fd; fd = open(snap_log_path, O_RDONLY); if (fd < 0) { sd_err("%m"); goto out; } if (fstat(fd, &st) < 0) { sd_err("%m"); goto out_close; } len = st.st_size; buffer = xmalloc(len); len = xread(fd, buffer, len); if (len != st.st_size) { free(buffer); buffer = NULL; goto out_close; } *out_nr = len / sizeof(struct snap_log); out_close: close(fd); out: return buffer; } struct snap_file *snap_file_read(unsigned char *sha1) { size_t size; return sha1_file_read(sha1, &size); } int snap_file_write(uint32_t idx, unsigned char *trunk_sha1, unsigned char *outsha1) { struct snap_file snap; snap.idx = idx; memcpy(snap.trunk_sha1, trunk_sha1, SHA1_DIGEST_SIZE); return sha1_file_write(&snap, sizeof(struct snap_file), outsha1); } sheepdog-0.7.5/dog/farm/trunk.c000066400000000000000000000037261223630776600163520ustar00rootroot00000000000000/* * Copyright 
(C) 2011 Taobao Inc. * Copyright (C) 2013 Zelin.io * * Liu Yuan * Kai Zhang * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /* * Trunk object is meta data that describes the structure of the data objects * at the time of snapshot being taken. It ties data objects together into a * flat directory structure. */ #include #include #include #include "farm.h" #include "strbuf.h" #include "list.h" #include "util.h" #include "sheepdog_proto.h" static uint64_t total_count; int trunk_file_write(uint64_t nr_entries, struct trunk_entry *entries, unsigned char *trunk_sha1) { size_t size = sizeof(struct trunk_entry) * nr_entries; return sha1_file_write(entries, size, trunk_sha1); } static struct trunk_file *trunk_file_read(unsigned char *sha1) { size_t size; struct trunk_file *trunk = NULL; void *buf = sha1_file_read(sha1, &size); if (!buf) return NULL; trunk = xmalloc(sizeof(struct trunk_file)); trunk->nr_entries = size / sizeof(struct trunk_entry); trunk->entries = buf; return trunk; } int for_each_entry_in_trunk(unsigned char *trunk_sha1, int (*func)(struct trunk_entry *entry, void *data), void *data) { struct trunk_file *trunk; struct trunk_entry *entry; int ret = -1; trunk = trunk_file_read(trunk_sha1); if (!trunk) { sd_err("failed to read trunk"); return ret; } total_count = trunk->nr_entries; entry = trunk->entries; for (uint64_t i = 0; i < trunk->nr_entries; i++, entry++) { if (func(entry, data) < 0) goto out; } ret = 0; out: free(trunk->entries); free(trunk); return ret; } uint64_t trunk_get_count(void) { return total_count; } sheepdog-0.7.5/dog/node.c000066400000000000000000000246371223630776600152130ustar00rootroot00000000000000/* * Copyright (C) 2011 Nippon Telegraph and Telephone Corporation. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "dog.h" static struct node_cmd_data { bool all_nodes; bool recovery_progress; } node_cmd_data; static void cal_total_vdi_size(uint32_t vid, const char *name, const char *tag, uint32_t snapid, uint32_t flags, const struct sd_inode *i, void *data) { uint64_t *size = data; if (!vdi_is_snapshot(i)) *size += i->vdi_size; } static int node_list(int argc, char **argv) { int i; if (!raw_output) printf(" Id Host:Port V-Nodes Zone\n"); for (i = 0; i < sd_nodes_nr; i++) { const char *host = addr_to_str(sd_nodes[i].nid.addr, sd_nodes[i].nid.port); printf(raw_output ? "%d %s %d %u\n" : "%4d %-20s\t%2d%11u\n", i, host, sd_nodes[i].nr_vnodes, sd_nodes[i].zone); } return EXIT_SUCCESS; } static int node_info(int argc, char **argv) { int i, ret, success = 0; uint64_t total_size = 0, total_avail = 0, total_vdi_size = 0; char total_str[UINT64_DECIMAL_SIZE], use_str[UINT64_DECIMAL_SIZE], avail_str[UINT64_DECIMAL_SIZE], vdi_size_str[UINT64_DECIMAL_SIZE]; if (!raw_output) printf("Id\tSize\tUsed\tAvail\tUse%%\n"); for (i = 0; i < sd_nodes_nr; i++) { struct sd_req req; struct sd_rsp *rsp = (struct sd_rsp *)&req; char store_str[UINT64_DECIMAL_SIZE], used_str[UINT64_DECIMAL_SIZE], free_str[UINT64_DECIMAL_SIZE]; sd_init_req(&req, SD_OP_STAT_SHEEP); ret = send_light_req(&req, sd_nodes[i].nid.addr, sd_nodes[i].nid.port); size_to_str(rsp->node.store_size, store_str, sizeof(store_str)); size_to_str(rsp->node.store_free, free_str, sizeof(free_str)); size_to_str(rsp->node.store_size - rsp->node.store_free, used_str, sizeof(used_str)); if (!ret) { int ratio = (int)(((double)(rsp->node.store_size - rsp->node.store_free) / rsp->node.store_size) * 100); printf(raw_output ? 
"%d %s %s %s %d%%\n" : "%2d\t%s\t%s\t%s\t%3d%%\n", i, store_str, used_str, free_str, rsp->node.store_size == 0 ? 0 : ratio); success++; } total_size += rsp->node.store_size; total_avail += rsp->node.store_free; } if (success == 0) { sd_err("Cannot get information from any nodes"); return EXIT_SYSFAIL; } if (parse_vdi(cal_total_vdi_size, SD_INODE_HEADER_SIZE, &total_vdi_size) < 0) return EXIT_SYSFAIL; size_to_str(total_size, total_str, sizeof(total_str)); size_to_str(total_avail, avail_str, sizeof(avail_str)); size_to_str(total_size - total_avail, use_str, sizeof(use_str)); size_to_str(total_vdi_size, vdi_size_str, sizeof(vdi_size_str)); printf(raw_output ? "Total %s %s %s %d%% %s\n" : "Total\t%s\t%s\t%s\t%3d%%\n\n" "Total virtual image size\t%s\n", total_str, use_str, avail_str, (int)(((double)(total_size - total_avail) / total_size) * 100), vdi_size_str); return EXIT_SUCCESS; } static int get_recovery_state(struct recovery_state *state) { int ret; struct sd_req req; struct sd_rsp *rsp = (struct sd_rsp *)&req; sd_init_req(&req, SD_OP_STAT_RECOVERY); req.data_length = sizeof(*state); ret = dog_exec_req(sdhost, sdport, &req, state); if (ret < 0) { sd_err("Failed to execute request"); return -1; } if (rsp->result != SD_RES_SUCCESS) { sd_err("%s", sd_strerror(rsp->result)); return -1; } return 0; } static int node_recovery_progress(void) { int result; unsigned int prev_nr_total; struct recovery_state rstate; /* * ToDos * * 1. Calculate size of actually copied objects. * For doing this, not so trivial changes for recovery process are * required. * * 2. Print remaining physical time. * Even if it is not so acculate, the information is helpful for * administrators. 
*/ result = get_recovery_state(&rstate); if (result < 0) return EXIT_SYSFAIL; if (!rstate.in_recovery) return EXIT_SUCCESS; do { prev_nr_total = rstate.nr_total; result = get_recovery_state(&rstate); if (result < 0) break; if (!rstate.in_recovery) { show_progress(prev_nr_total, prev_nr_total, true); break; } switch (rstate.state) { case RW_PREPARE_LIST: printf("\rpreparing a checked object list..."); break; case RW_NOTIFY_COMPLETION: printf("\rnotifying a completion of recovery..."); break; case RW_RECOVER_OBJ: show_progress(rstate.nr_finished, rstate.nr_total, true); break; default: panic("unknown state of recovery: %d", rstate.state); break; } sleep(1); } while (true); return result < 0 ? EXIT_SYSFAIL : EXIT_SUCCESS; } static int node_recovery(int argc, char **argv) { int i, ret; if (node_cmd_data.recovery_progress) return node_recovery_progress(); if (!raw_output) { printf("Nodes In Recovery:\n"); printf(" Id Host:Port V-Nodes Zone" " Progress\n"); } for (i = 0; i < sd_nodes_nr; i++) { struct sd_req req; struct sd_rsp *rsp = (struct sd_rsp *)&req; struct recovery_state state; memset(&state, 0, sizeof(state)); sd_init_req(&req, SD_OP_STAT_RECOVERY); req.data_length = sizeof(state); ret = dog_exec_req(sd_nodes[i].nid.addr, sd_nodes[i].nid.port, &req, &state); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result != SD_RES_SUCCESS) { sd_err("%s", sd_strerror(rsp->result)); return EXIT_FAILURE; } if (state.in_recovery) { const char *host = addr_to_str(sd_nodes[i].nid.addr, sd_nodes[i].nid.port); if (raw_output) printf("%d %s %d %d %"PRIu64" %"PRIu64"\n", i, host, sd_nodes[i].nr_vnodes, sd_nodes[i].zone, state.nr_finished, state.nr_total); else printf("%4d %-20s%5d%11d%11.1f%%\n", i, host, sd_nodes[i].nr_vnodes, sd_nodes[i].zone, 100 * (float)state.nr_finished / state.nr_total); } } return EXIT_SUCCESS; } static int node_kill(int argc, char **argv) { int node_id, ret; struct sd_req req; const char *p = argv[optind++]; if (!is_numeric(p)) { sd_err("Invalid node id '%s', 
please specify a numeric value", p); exit(EXIT_USAGE); } node_id = strtol(p, NULL, 10); if (node_id < 0 || node_id >= sd_nodes_nr) { sd_err("Invalid node id '%d'", node_id); exit(EXIT_USAGE); } sd_init_req(&req, SD_OP_KILL_NODE); ret = send_light_req(&req, sd_nodes[node_id].nid.addr, sd_nodes[node_id].nid.port); if (ret) { sd_err("Failed to execute request"); exit(EXIT_FAILURE); } return EXIT_SUCCESS; } static int node_md_info(struct node_id *nid) { struct sd_md_info info = {}; char size_str[UINT64_DECIMAL_SIZE], used_str[UINT64_DECIMAL_SIZE], avail_str[UINT64_DECIMAL_SIZE]; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; int ret, i; sd_init_req(&hdr, SD_OP_MD_INFO); hdr.data_length = sizeof(info); ret = dog_exec_req(nid->addr, nid->port, &hdr, &info); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result != SD_RES_SUCCESS) { sd_err("failed to get multi-disk infomation: %s", sd_strerror(rsp->result)); return EXIT_FAILURE; } for (i = 0; i < info.nr; i++) { uint64_t size = info.disk[i].free + info.disk[i].used; int ratio = (int)(((double)info.disk[i].used / size) * 100); size_to_str(size, size_str, sizeof(size_str)); size_to_str(info.disk[i].used, used_str, sizeof(used_str)); size_to_str(info.disk[i].free, avail_str, sizeof(avail_str)); fprintf(stdout, "%2d\t%s\t%s\t%s\t%3d%%\t%s\n", info.disk[i].idx, size_str, used_str, avail_str, ratio, info.disk[i].path); } return EXIT_SUCCESS; } static int md_info(int argc, char **argv) { int i, ret; fprintf(stdout, "Id\tSize\tUsed\tAvail\tUse%%\tPath\n"); if (!node_cmd_data.all_nodes) { struct node_id nid = {.port = sdport}; memcpy(nid.addr, sdhost, sizeof(nid.addr)); return node_md_info(&nid); } for (i = 0; i < sd_nodes_nr; i++) { fprintf(stdout, "Node %d:\n", i); ret = node_md_info(&sd_nodes[i].nid); if (ret != EXIT_SUCCESS) return EXIT_FAILURE; } return EXIT_SUCCESS; } static int do_plug_unplug(char *disks, bool plug) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; int ret; if (!strlen(disks)) { 
sd_err("Empty path isn't allowed"); return EXIT_FAILURE; } if (plug) sd_init_req(&hdr, SD_OP_MD_PLUG); else sd_init_req(&hdr, SD_OP_MD_UNPLUG); hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = strlen(disks) + 1; ret = dog_exec_req(sdhost, sdport, &hdr, disks); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result != SD_RES_SUCCESS) { sd_err("Failed to execute request, look for sheep.log" " for more information"); return EXIT_FAILURE; } return EXIT_SUCCESS; } static int md_plug(int argc, char **argv) { return do_plug_unplug(argv[optind], true); } static int md_unplug(int argc, char **argv) { return do_plug_unplug(argv[optind], false); } static struct subcommand node_md_cmd[] = { {"info", NULL, NULL, "show multi-disk information", NULL, CMD_NEED_NODELIST, md_info}, {"plug", NULL, NULL, "plug more disk(s) into node", NULL, CMD_NEED_ARG, md_plug}, {"unplug", NULL, NULL, "unplug disk(s) from node", NULL, CMD_NEED_ARG, md_unplug}, {NULL}, }; static int node_md(int argc, char **argv) { return do_generic_subcommand(node_md_cmd, argc, argv); } static int node_parser(int ch, char *opt) { switch (ch) { case 'A': node_cmd_data.all_nodes = true; break; case 'P': node_cmd_data.recovery_progress = true; break; } return 0; } static struct sd_option node_options[] = { {'A', "all", false, "show md information of all the nodes"}, {'P', "progress", false, "show progress of recovery in the node"}, { 0, NULL, false, NULL }, }; static struct subcommand node_cmd[] = { {"kill", "", "aprh", "kill node", NULL, CMD_NEED_ARG | CMD_NEED_NODELIST, node_kill}, {"list", NULL, "aprh", "list nodes", NULL, CMD_NEED_NODELIST, node_list}, {"info", NULL, "aprh", "show information about each node", NULL, CMD_NEED_NODELIST, node_info}, {"recovery", NULL, "aphPr", "show recovery information of nodes", NULL, CMD_NEED_NODELIST, node_recovery, node_options}, {"md", "[disks]", "apAh", "See 'dog node md' for more information", node_md_cmd, CMD_NEED_ARG, node_md, node_options}, {NULL,}, }; struct command 
node_command = { "node", node_cmd, node_parser }; sheepdog-0.7.5/dog/trace.c000066400000000000000000000212061223630776600153510ustar00rootroot00000000000000/* * Copyright (C) 2011 Taobao Inc. * * Liu Yuan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include #include #include #include #include #include "dog.h" #include "rbtree.h" #include "list.h" static inline void print_thread_name(struct trace_graph_item *item) { printf("%-*s|", TRACE_THREAD_LEN, item->tname); } static inline void print_time(struct trace_graph_item *item) { if (item->type == TRACE_GRAPH_RETURN) { unsigned duration = item->return_time - item->entry_time; unsigned quot = duration / 1000, rem = duration % 1000; printf("%8u.%-3u|", quot, rem); } else if (item->type == TRACE_GRAPH_ENTRY) { printf(" |"); } } static inline void print_finale(struct trace_graph_item *item) { int i; for (i = 0; i < item->depth; i++) printf(" "); if (item->type == TRACE_GRAPH_ENTRY) printf("%s() {\n", item->fname); else printf("}\n"); } static void print_trace_item(struct trace_graph_item *item) { print_thread_name(item); print_time(item); print_finale(item); } static void cat_trace_file(void *buf, size_t size) { struct trace_graph_item *item = (struct trace_graph_item *)buf; size_t sz = size / sizeof(struct trace_graph_item), i; printf(" Thread Name | Time(us) | Function Graph\n"); printf("--------------------------------------------------\n"); for (i = 0; i < sz; i++) print_trace_item(item++); return; } static const char *tracefile = "/tmp/tracefile"; static int trace_read_buffer(void) { int ret, tfd; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; #define TRACE_BUF_LEN (1024 * 1024 * 20) char *buf = xzalloc(TRACE_BUF_LEN); tfd = 
open(tracefile, O_CREAT | O_RDWR | O_APPEND | O_TRUNC, 0644); if (tfd < 0) { sd_err("can't create tracefile"); return EXIT_SYSFAIL; } read_buffer: sd_init_req(&hdr, SD_OP_TRACE_READ_BUF); hdr.data_length = TRACE_BUF_LEN; ret = dog_exec_req(sdhost, sdport, &hdr, buf); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result == SD_RES_AGAIN) goto read_buffer; if (rsp->result != SD_RES_SUCCESS) { sd_err("Trace failed: %s", sd_strerror(rsp->result)); return EXIT_FAILURE; } xwrite(tfd, buf, rsp->data_length); if (rsp->data_length == TRACE_BUF_LEN) { memset(buf, 0, TRACE_BUF_LEN); goto read_buffer; } free(buf); return EXIT_SUCCESS; } static int trace_enable(int argc, char **argv) { const char *tracer = argv[optind]; int ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; sd_init_req(&hdr, SD_OP_TRACE_ENABLE); hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = strlen(tracer) + 1; ret = dog_exec_req(sdhost, sdport, &hdr, (void *)tracer); if (ret < 0) return EXIT_SYSFAIL; switch (rsp->result) { case SD_RES_SUCCESS: break; case SD_RES_NO_SUPPORT: sd_err("no such tracer %s", tracer); return EXIT_FAILURE; case SD_RES_INVALID_PARMS: sd_err("tracer %s is already enabled", tracer); return EXIT_FAILURE; default: sd_err("unknown error (%s)", sd_strerror(rsp->result)); return EXIT_SYSFAIL; } return EXIT_SUCCESS; } static int trace_disable(int argc, char **argv) { const char *tracer = argv[optind]; int ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; sd_init_req(&hdr, SD_OP_TRACE_DISABLE); hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = strlen(tracer) + 1; ret = dog_exec_req(sdhost, sdport, &hdr, (void *)tracer); if (ret < 0) return EXIT_SYSFAIL; switch (rsp->result) { case SD_RES_SUCCESS: break; case SD_RES_NO_SUPPORT: sd_err("no such tracer %s", tracer); return EXIT_FAILURE; case SD_RES_INVALID_PARMS: sd_err("tracer %s is not enabled", tracer); return EXIT_FAILURE; default: sd_err("unknown error (%s)", sd_strerror(rsp->result)); return EXIT_SYSFAIL; } 
return trace_read_buffer(); } static int trace_status(int argc, char **argv) { char buf[4096]; /* must have enough space to store tracer list */ int ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; sd_init_req(&hdr, SD_OP_TRACE_STATUS); hdr.data_length = sizeof(buf); ret = dog_exec_req(sdhost, sdport, &hdr, buf); if (ret < 0) return EXIT_SYSFAIL; switch (rsp->result) { sd_err("%s", sd_strerror(rsp->result)); return EXIT_FAILURE; } printf("%s", buf); return EXIT_SUCCESS; } static void *map_trace_file(struct stat *st) { int fd = open(tracefile, O_RDONLY); void *map; if (fd < 0) { sd_err("%m"); return NULL; } if (fstat(fd, st) < 0) { sd_err("%m"); close(fd); return NULL; } if (st->st_size == 0) { sd_err("trace file is empty"); return NULL; } map = mmap(NULL, st->st_size, PROT_READ, MAP_PRIVATE, fd, 0); close(fd); if (map == MAP_FAILED) { sd_err("%m"); return NULL; } return map; } static int graph_cat(int argc, char **argv) { struct stat st; void *map = map_trace_file(&st); if (!map) return EXIT_FAILURE; cat_trace_file(map, st.st_size); munmap(map, st.st_size); return EXIT_SUCCESS; } struct graph_stat_entry { struct rb_node rb; struct list_head list; char fname[TRACE_FNAME_LEN]; uint64_t duration; uint16_t nr_calls; }; static struct rb_root stat_tree_root; static LIST_HEAD(stat_list); static struct graph_stat_entry * stat_tree_insert(struct graph_stat_entry *new) { struct rb_node **p = &stat_tree_root.rb_node; struct rb_node *parent = NULL; struct graph_stat_entry *entry; while (*p) { int cmp; parent = *p; entry = rb_entry(parent, struct graph_stat_entry, rb); cmp = strcmp(new->fname, entry->fname); if (cmp < 0) p = &(*p)->rb_left; else if (cmp > 0) p = &(*p)->rb_right; else { entry->duration += new->duration; entry->nr_calls++; return entry; } } rb_link_node(&new->rb, parent, p); rb_insert_color(&new->rb, &stat_tree_root); return NULL; /* insert successfully */ } static void prepare_stat_tree(struct trace_graph_item *item) { struct graph_stat_entry 
*new; if (item->type != TRACE_GRAPH_RETURN) return; new = xmalloc(sizeof(*new)); pstrcpy(new->fname, sizeof(new->fname), item->fname); new->duration = item->return_time - item->entry_time; new->nr_calls = 1; INIT_LIST_HEAD(&new->list); if (stat_tree_insert(new)) { free(new); return; } list_add(&new->list, &stat_list); } static void stat_list_print(void) { struct graph_stat_entry *entry; list_for_each_entry(entry, &stat_list, list) { float total = (float)entry->duration / 1000000000; float per = (float)entry->duration / entry->nr_calls / 1000000; printf("%10.3f %10.3f %5"PRIu16" %-*s\n", total, per, entry->nr_calls, TRACE_FNAME_LEN, entry->fname); } } static int stat_list_cmp(void *priv, struct list_head *a, struct list_head *b) { struct graph_stat_entry *ga = container_of(a, struct graph_stat_entry, list); struct graph_stat_entry *gb = container_of(b, struct graph_stat_entry, list); /* '-' is for reverse sort, largest first */ return -intcmp(ga->duration, gb->duration); } static void stat_trace_file(void *buf, size_t size) { struct trace_graph_item *item = (struct trace_graph_item *)buf; size_t sz = size / sizeof(struct trace_graph_item), i; printf(" Total (s) Per Call (ms) Calls Name\n"); for (i = 0; i < sz; i++) prepare_stat_tree(item++); list_sort(NULL, &stat_list, stat_list_cmp); stat_list_print(); } static int graph_stat(int argc, char **argv) { struct stat st; void *map = map_trace_file(&st); if (!map) return EXIT_FAILURE; stat_trace_file(map, st.st_size); munmap(map, st.st_size); return EXIT_SUCCESS; } static int trace_parser(int ch, char *opt) { return 0; } static struct subcommand graph_cmd[] = { {"cat", NULL, NULL, "cat the output of graph tracer", NULL, 0, graph_cat}, {"stat", NULL, NULL, "get the stat of the graph calls", NULL, 0, graph_stat}, {NULL,}, }; static int trace_graph(int argc, char **argv) { return do_generic_subcommand(graph_cmd, argc, argv); } /* Subcommand list of trace */ static struct subcommand trace_cmd[] = { {"enable", "", "aph", 
"enable tracer", NULL, CMD_NEED_ARG, trace_enable}, {"disable", "", "aph", "disable tracer", NULL, CMD_NEED_ARG, trace_disable}, {"status", NULL, "aph", "show tracer statuses", NULL, 0, trace_status}, {"graph", NULL, "aph", "run dog trace graph for more information", graph_cmd, CMD_NEED_ARG, trace_graph}, {NULL}, }; struct command trace_command = { "trace", trace_cmd, trace_parser }; sheepdog-0.7.5/dog/treeview.c000066400000000000000000000074651223630776600161200ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include #include "util.h" #include "treeview.h" #ifndef MAX_DEPTH #define MAX_DEPTH 100 #endif struct vdi_tree { char name[1024]; char label[256]; uint32_t vid; uint32_t pvid; bool highlight; struct list_head children; struct list_head siblings; }; static int *width, *more; static struct vdi_tree *root; static struct vdi_tree *find_vdi(struct vdi_tree *parent, uint32_t vid, const char *name) { struct vdi_tree *vdi, *ret; list_for_each_entry(vdi, &parent->children, siblings) { if (vdi->vid == vid && !strcmp(vdi->name, name)) return vdi; ret = find_vdi(vdi, vid, name); if (ret) return ret; } return NULL; } static struct vdi_tree *new_vdi(const char *name, const char *label, uint64_t vid, uint64_t pvid, bool highlight) { struct vdi_tree *vdi; vdi = xmalloc(sizeof(struct vdi_tree)); pstrcpy(vdi->name, sizeof(vdi->name), name); pstrcpy(vdi->label, sizeof(vdi->label), label); vdi->vid = vid; vdi->pvid = pvid; vdi->highlight = highlight; INIT_LIST_HEAD(&vdi->children); return vdi; } void init_tree(void) { root = new_vdi("", "", 0, 0, 0); } void add_vdi_tree(const char *name, const char 
*label, uint32_t vid, uint32_t pvid, bool highlight) { struct vdi_tree *vdi, *parent; vdi = new_vdi(name, label, vid, pvid, highlight); if (!vdi) return; parent = find_vdi(root, pvid, name); if (!parent) parent = root; list_add_tail(&vdi->siblings, &parent->children); } static void compaction(struct vdi_tree *parent) { struct vdi_tree *vdi, *e, *new_parent; list_for_each_entry_safe(vdi, e, &parent->children, siblings) { new_parent = find_vdi(root, vdi->pvid, vdi->name); if (new_parent && parent != new_parent) list_move_tail(&vdi->siblings, &new_parent->children); compaction(vdi); } } static int get_depth(struct vdi_tree *parent) { struct vdi_tree *vdi; int max_depth = 0, depth; list_for_each_entry(vdi, &parent->children, siblings) { depth = get_depth(vdi); if (max_depth < depth) max_depth = depth; } return max_depth + 1; } static void spaces(int n) { while (n--) putchar(' '); } static void indent(int level, bool first, bool last) { int lvl; if (first) printf(last ? "---" : "-+-"); else { for (lvl = 0; lvl < level - 1; lvl++) { spaces(width[lvl] + 1); printf(more[lvl + 1] ? "| " : " "); } spaces(width[level - 1] + 1); printf(last ? 
"`-" : "|-"); } } static void _dump_tree(struct vdi_tree *current, int level, bool first, bool last) { struct vdi_tree *vdi; indent(level, first, last); if (current->highlight) printf(TEXT_BOLD); printf("%s", current->label); if (current->highlight) printf(TEXT_NORMAL); if (list_empty(¤t->children)) { putchar('\n'); return; } more[level] = !last; width[level] = strlen(current->label); list_for_each_entry(vdi, ¤t->children, siblings) { _dump_tree(vdi, level + 1, &vdi->siblings == current->children.next, vdi->siblings.next == ¤t->children); } } void dump_tree(void) { struct vdi_tree *vdi; int depth; compaction(root); depth = get_depth(root); width = malloc(sizeof(int) * depth); more = malloc(sizeof(int) * depth); if (!width || !more) { sd_err("Failed to allocate memory"); return; } list_for_each_entry(vdi, &root->children, siblings) { printf("%s", vdi->name); more[0] = 0; width[0] = strlen(vdi->name); _dump_tree(vdi, 1, true, true); } } sheepdog-0.7.5/dog/treeview.h000066400000000000000000000011711223630776600161110ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __TREEVIEW__ #define __TREEVIEW__ #include void init_tree(void); void add_vdi_tree(const char *label, const char *tag, uint32_t vid, uint32_t pvid, bool highlight); void dump_tree(void); #endif sheepdog-0.7.5/dog/vdi.c000066400000000000000000001521071223630776600150420ustar00rootroot00000000000000/* * Copyright (C) 2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. 
* * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include #include #include #include "dog.h" #include "treeview.h" #include "sha1.h" static struct sd_option vdi_options[] = { {'P', "prealloc", false, "preallocate all the data objects"}, {'i', "index", true, "specify the index of data objects"}, {'s', "snapshot", true, "specify a snapshot id or tag name"}, {'x', "exclusive", false, "write in an exclusive mode"}, {'d', "delete", false, "delete a key"}, {'w', "writeback", false, "use writeback mode"}, {'c', "copies", true, "specify the data redundancy (number of copies)"}, {'F', "from", true, "create a differential backup from the snapshot"}, {'f', "force", false, "do operation forcibly"}, { 0, NULL, false, NULL }, }; static struct vdi_cmd_data { unsigned int index; int snapshot_id; char snapshot_tag[SD_MAX_VDI_TAG_LEN]; bool exclusive; bool delete; bool prealloc; int nr_copies; bool writeback; int from_snapshot_id; char from_snapshot_tag[SD_MAX_VDI_TAG_LEN]; bool force; } vdi_cmd_data = { ~0, }; struct get_vdi_info { const char *name; const char *tag; uint32_t vid; uint32_t snapid; uint8_t nr_copies; }; static int parse_option_size(const char *value, uint64_t *ret) { char *postfix; double sizef; sizef = strtod(value, &postfix); switch (*postfix) { case 'T': sizef *= 1024; case 'G': sizef *= 1024; case 'M': sizef *= 1024; case 'K': case 'k': sizef *= 1024; case 'b': case '\0': *ret = (uint64_t) sizef; break; default: sd_err("Invalid size '%s'", value); sd_err("You may use k, M, G or T suffixes for " "kilobytes, megabytes, gigabytes and terabytes."); return -1; } return 0; } static void vdi_show_progress(uint64_t done, uint64_t total) { return show_progress(done, total, false); } static void print_vdi_list(uint32_t vid, const char *name, const char *tag, uint32_t snapid, uint32_t flags, const struct sd_inode *i, void *data) { int idx; bool is_clone = false; uint64_t my_objs, 
cow_objs; char vdi_size_str[16], my_objs_str[16], cow_objs_str[16]; time_t ti; struct tm tm; char dbuf[128]; struct get_vdi_info *info = data; if (info && strcmp(name, info->name) != 0) return; ti = i->create_time >> 32; if (raw_output) { snprintf(dbuf, sizeof(dbuf), "%" PRIu64, (uint64_t) ti); } else { localtime_r(&ti, &tm); strftime(dbuf, sizeof(dbuf), "%Y-%m-%d %H:%M", &tm); } my_objs = 0; cow_objs = 0; for (idx = 0; idx < MAX_DATA_OBJS; idx++) { if (!i->data_vdi_id[idx]) continue; if (is_data_obj_writeable(i, idx)) my_objs++; else cow_objs++; } size_to_str(i->vdi_size, vdi_size_str, sizeof(vdi_size_str)); size_to_str(my_objs * SD_DATA_OBJ_SIZE, my_objs_str, sizeof(my_objs_str)); size_to_str(cow_objs * SD_DATA_OBJ_SIZE, cow_objs_str, sizeof(cow_objs_str)); if (i->snap_id == 1 && i->parent_vdi_id != 0) is_clone = true; if (raw_output) { printf("%c ", vdi_is_snapshot(i) ? 's' : (is_clone ? 'c' : '=')); while (*name) { if (isspace(*name) || *name == '\\') putchar('\\'); putchar(*name++); } printf(" %d %s %s %s %s %" PRIx32 " %d %s\n", snapid, vdi_size_str, my_objs_str, cow_objs_str, dbuf, vid, i->nr_copies, i->tag); } else { printf("%c %-8s %5d %7s %7s %7s %s %7" PRIx32 " %5d %13s\n", vdi_is_snapshot(i) ? 's' : (is_clone ? 
'c' : ' '), name, snapid, vdi_size_str, my_objs_str, cow_objs_str, dbuf, vid, i->nr_copies, i->tag); } } static void print_vdi_tree(uint32_t vid, const char *name, const char *tag, uint32_t snapid, uint32_t flags, const struct sd_inode *i, void *data) { time_t ti; struct tm tm; char buf[128]; if (vdi_is_snapshot(i)) { ti = i->create_time >> 32; localtime_r(&ti, &tm); strftime(buf, sizeof(buf), "[%Y-%m-%d %H:%M]", &tm); } else pstrcpy(buf, sizeof(buf), "(you are here)"); add_vdi_tree(name, buf, vid, i->parent_vdi_id, highlight && !vdi_is_snapshot(i)); } static void print_vdi_graph(uint32_t vid, const char *name, const char *tag, uint32_t snapid, uint32_t flags, const struct sd_inode *i, void *data) { time_t ti; struct tm tm; char dbuf[128], tbuf[128], size_str[128]; ti = i->create_time >> 32; localtime_r(&ti, &tm); strftime(dbuf, sizeof(dbuf), "%Y-%m-%d", &tm); strftime(tbuf, sizeof(tbuf), "%H:%M:%S", &tm); size_to_str(i->vdi_size, size_str, sizeof(size_str)); printf(" \"%x\" -> \"%x\";\n", i->parent_vdi_id, vid); printf(" \"%x\" [\n" " group = \"%s\",\n" " label = \"", vid, name); printf("Name: %10s\\n" "Tag: %10x\\n" "Size: %10s\\n" "Date: %10s\\n" "Time: %10s", name, snapid, size_str, dbuf, tbuf); if (vdi_is_snapshot(i)) printf("\"\n ];\n\n"); else printf("\",\n color=\"red\"\n ];\n\n"); } static void get_oid(uint32_t vid, const char *name, const char *tag, uint32_t snapid, uint32_t flags, const struct sd_inode *i, void *data) { struct get_vdi_info *info = data; if (info->name) { if (info->tag && info->tag[0]) { if (!strcmp(name, info->name) && !strcmp(tag, info->tag)) { info->vid = vid; info->nr_copies = i->nr_copies; } } else if (info->snapid) { if (!strcmp(name, info->name) && snapid == info->snapid) { info->vid = vid; info->nr_copies = i->nr_copies; } } else { if (!strcmp(name, info->name)) { info->vid = vid; info->nr_copies = i->nr_copies; } } } } typedef int (*obj_parser_func_t)(const char *sheep, uint64_t oid, struct sd_rsp *rsp, char *buf, void *data); 
static int do_print_obj(const char *sheep, uint64_t oid, struct sd_rsp *rsp, char *buf, void *data) { switch (rsp->result) { case SD_RES_SUCCESS: printf("%s has the object (should be %d copies)\n", sheep, rsp->obj.copies); break; case SD_RES_NO_OBJ: printf("%s doesn't have the object\n", sheep); break; case SD_RES_OLD_NODE_VER: case SD_RES_NEW_NODE_VER: sd_err("The node list has changed: please try again"); break; default: sd_err("%s: hit an unexpected error (%s)", sheep, sd_strerror(rsp->result)); break; } return 0; } struct get_data_oid_info { bool success; uint64_t data_oid; unsigned idx; }; static int get_data_oid(const char *sheep, uint64_t oid, struct sd_rsp *rsp, char *buf, void *data) { struct get_data_oid_info *info = data; struct sd_inode *inode = (struct sd_inode *)buf; switch (rsp->result) { case SD_RES_SUCCESS: if (info->success) break; info->success = true; if (inode->data_vdi_id[info->idx]) { info->data_oid = vid_to_data_oid(inode->data_vdi_id[info->idx], info->idx); return 1; } break; case SD_RES_NO_OBJ: break; case SD_RES_OLD_NODE_VER: case SD_RES_NEW_NODE_VER: sd_err("The node list has changed: please try again"); break; default: sd_err("%s: hit an unexpected error (%s)", sheep, sd_strerror(rsp->result)); break; } return 0; } static void parse_objs(uint64_t oid, obj_parser_func_t func, void *data, unsigned size) { int i, ret, cb_ret; char *buf; buf = xzalloc(size); for (i = 0; i < sd_nodes_nr; i++) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; sd_init_req(&hdr, SD_OP_READ_PEER); hdr.data_length = size; hdr.flags = 0; hdr.epoch = sd_epoch; hdr.obj.oid = oid; ret = dog_exec_req(sd_nodes[i].nid.addr, sd_nodes[i].nid.port, &hdr, buf); if (ret < 0) continue; switch (rsp->result) { sd_err("%s", sd_strerror(rsp->result)); continue; } untrim_zero_blocks(buf, rsp->obj.offset, rsp->data_length, size); cb_ret = func(addr_to_str(sd_nodes[i].nid.addr, sd_nodes[i].nid.port), oid, rsp, buf, data); if (cb_ret) break; } free(buf); } static int 
vdi_list(int argc, char **argv) { const char *vdiname = argv[optind]; if (!raw_output) printf(" Name Id Size Used Shared Creation time VDI id Copies Tag\n"); if (vdiname) { struct get_vdi_info info; memset(&info, 0, sizeof(info)); info.name = vdiname; if (parse_vdi(print_vdi_list, SD_INODE_SIZE, &info) < 0) return EXIT_SYSFAIL; return EXIT_SUCCESS; } else { if (parse_vdi(print_vdi_list, SD_INODE_SIZE, NULL) < 0) return EXIT_SYSFAIL; return EXIT_SUCCESS; } } static int vdi_tree(int argc, char **argv) { init_tree(); if (parse_vdi(print_vdi_tree, SD_INODE_HEADER_SIZE, NULL) < 0) return EXIT_SYSFAIL; dump_tree(); return EXIT_SUCCESS; } static int vdi_graph(int argc, char **argv) { /* print a header */ printf("digraph G {\n"); printf(" node [shape = \"box\", fontname = \"Courier\"];\n\n"); printf(" \"0\" [shape = \"ellipse\", label = \"root\"];\n\n"); if (parse_vdi(print_vdi_graph, SD_INODE_HEADER_SIZE, NULL) < 0) return EXIT_SYSFAIL; /* print a footer */ printf("}\n"); return EXIT_SUCCESS; } static int find_vdi_name(const char *vdiname, uint32_t snapid, const char *tag, uint32_t *vid, int for_snapshot) { int ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN]; memset(buf, 0, sizeof(buf)); pstrcpy(buf, SD_MAX_VDI_LEN, vdiname); if (tag) pstrcpy(buf + SD_MAX_VDI_LEN, SD_MAX_VDI_TAG_LEN, tag); if (for_snapshot) sd_init_req(&hdr, SD_OP_GET_VDI_INFO); else sd_init_req(&hdr, SD_OP_LOCK_VDI); hdr.data_length = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN; hdr.flags = SD_FLAG_CMD_WRITE; hdr.vdi.snapid = snapid; ret = dog_exec_req(sdhost, sdport, &hdr, buf); if (ret < 0) return -1; if (rsp->result != SD_RES_SUCCESS) { sd_err("Cannot get VDI info for %s %d %s: %s", vdiname, snapid, tag, sd_strerror(rsp->result)); return -1; } *vid = rsp->vdi.vdi_id; return 0; } static int read_vdi_obj(const char *vdiname, int snapid, const char *tag, uint32_t *pvid, struct sd_inode *inode, size_t size) { int ret; uint32_t vid; ret = 
find_vdi_name(vdiname, snapid, tag, &vid, 0); if (ret < 0) { sd_err("Failed to open VDI %s", vdiname); return EXIT_FAILURE; } ret = sd_read_object(vid_to_vdi_oid(vid), inode, size, 0, true); if (ret != SD_RES_SUCCESS) { if (snapid) { sd_err("Failed to read a snapshot %s:%d", vdiname, snapid); } else if (tag && tag[0]) { sd_err("Failed to read a snapshot %s:%s", vdiname, tag); } else { sd_err("Failed to read a vdi %s", vdiname); } return EXIT_FAILURE; } if (pvid) *pvid = vid; return EXIT_SUCCESS; } int do_vdi_create(const char *vdiname, int64_t vdi_size, uint32_t base_vid, uint32_t *vdi_id, bool snapshot, int nr_copies) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; int ret; char buf[SD_MAX_VDI_LEN]; memset(buf, 0, sizeof(buf)); pstrcpy(buf, SD_MAX_VDI_LEN, vdiname); sd_init_req(&hdr, SD_OP_NEW_VDI); hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = SD_MAX_VDI_LEN; hdr.vdi.base_vdi_id = base_vid; hdr.vdi.snapid = snapshot ? 1 : 0; hdr.vdi.vdi_size = vdi_size; hdr.vdi.copies = nr_copies; ret = dog_exec_req(sdhost, sdport, &hdr, buf); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result != SD_RES_SUCCESS) { sd_err("Failed to create VDI %s: %s", vdiname, sd_strerror(rsp->result)); return EXIT_FAILURE; } if (vdi_id) *vdi_id = rsp->vdi.vdi_id; return EXIT_SUCCESS; } static int vdi_create(int argc, char **argv) { const char *vdiname = argv[optind++]; uint64_t size; uint32_t vid; uint64_t oid; int idx, max_idx, ret, nr_copies = vdi_cmd_data.nr_copies; struct sd_inode *inode = NULL; if (!argv[optind]) { sd_err("Please specify the VDI size"); return EXIT_USAGE; } ret = parse_option_size(argv[optind], &size); if (ret < 0) return EXIT_USAGE; if (size > SD_MAX_VDI_SIZE) { sd_err("VDI size is too large"); return EXIT_USAGE; } if (nr_copies > sd_nodes_nr) { sd_err("There are not enough nodes(%d) to hold the copies(%d)", sd_nodes_nr, nr_copies); return EXIT_USAGE; } ret = do_vdi_create(vdiname, size, 0, &vid, false, vdi_cmd_data.nr_copies); if (ret != EXIT_SUCCESS || 
!vdi_cmd_data.prealloc) goto out; inode = xmalloc(sizeof(*inode)); ret = sd_read_object(vid_to_vdi_oid(vid), inode, sizeof(*inode), 0, true); if (ret != SD_RES_SUCCESS) { sd_err("Failed to read a newly created VDI object"); ret = EXIT_FAILURE; goto out; } max_idx = DIV_ROUND_UP(size, SD_DATA_OBJ_SIZE); for (idx = 0; idx < max_idx; idx++) { vdi_show_progress(idx * SD_DATA_OBJ_SIZE, inode->vdi_size); oid = vid_to_data_oid(vid, idx); ret = sd_write_object(oid, 0, NULL, 0, 0, 0, inode->nr_copies, true, true); if (ret != SD_RES_SUCCESS) { ret = EXIT_FAILURE; goto out; } inode->data_vdi_id[idx] = vid; ret = sd_write_object(vid_to_vdi_oid(vid), 0, &vid, sizeof(vid), SD_INODE_HEADER_SIZE + sizeof(vid) * idx, 0, inode->nr_copies, false, true); if (ret) { ret = EXIT_FAILURE; goto out; } } vdi_show_progress(idx * SD_DATA_OBJ_SIZE, inode->vdi_size); ret = EXIT_SUCCESS; if (verbose) { if (raw_output) printf("%x\n", vid); else printf("VDI ID of newly created VDI: %x\n", vid); } out: free(inode); return ret; } static int vdi_snapshot(int argc, char **argv) { const char *vdiname = argv[optind++]; uint32_t vid; int ret; char buf[SD_INODE_HEADER_SIZE]; struct sd_inode *inode = (struct sd_inode *)buf; if (vdi_cmd_data.snapshot_id != 0) { sd_err("Please specify a non-integer value for " "a snapshot tag name"); return EXIT_USAGE; } ret = read_vdi_obj(vdiname, 0, "", &vid, inode, SD_INODE_HEADER_SIZE); if (ret != EXIT_SUCCESS) return ret; ret = sd_write_object(vid_to_vdi_oid(vid), 0, vdi_cmd_data.snapshot_tag, SD_MAX_VDI_TAG_LEN, offsetof(struct sd_inode, tag), 0, inode->nr_copies, false, false); if (ret != SD_RES_SUCCESS) return EXIT_FAILURE; ret = do_vdi_create(vdiname, inode->vdi_size, vid, NULL, true, inode->nr_copies); if (ret == EXIT_SUCCESS && verbose) { if (raw_output) printf("%x\n", vid); else printf("VDI ID of newly created snapshot: %x\n", vid); } return ret; } static int vdi_clone(int argc, char **argv) { const char *src_vdi = argv[optind++], *dst_vdi; uint32_t base_vid, 
new_vid; uint64_t oid; int idx, max_idx, ret; struct sd_inode *inode = NULL; char *buf = NULL; dst_vdi = argv[optind]; if (!dst_vdi) { sd_err("Destination VDI name must be specified"); ret = EXIT_USAGE; goto out; } if (!vdi_cmd_data.snapshot_id && !vdi_cmd_data.snapshot_tag[0]) { sd_err("Only snapshot VDIs can be cloned"); sd_err("Please specify the '-s' option"); ret = EXIT_USAGE; goto out; } inode = xmalloc(sizeof(*inode)); ret = read_vdi_obj(src_vdi, vdi_cmd_data.snapshot_id, vdi_cmd_data.snapshot_tag, &base_vid, inode, SD_INODE_SIZE); if (ret != EXIT_SUCCESS) goto out; ret = do_vdi_create(dst_vdi, inode->vdi_size, base_vid, &new_vid, false, vdi_cmd_data.nr_copies); if (ret != EXIT_SUCCESS || !vdi_cmd_data.prealloc) goto out; buf = xzalloc(SD_DATA_OBJ_SIZE); max_idx = DIV_ROUND_UP(inode->vdi_size, SD_DATA_OBJ_SIZE); for (idx = 0; idx < max_idx; idx++) { vdi_show_progress(idx * SD_DATA_OBJ_SIZE, inode->vdi_size); if (inode->data_vdi_id[idx]) { oid = vid_to_data_oid(inode->data_vdi_id[idx], idx); ret = sd_read_object(oid, buf, SD_DATA_OBJ_SIZE, 0, true); if (ret) { ret = EXIT_FAILURE; goto out; } } else memset(buf, 0, SD_DATA_OBJ_SIZE); oid = vid_to_data_oid(new_vid, idx); ret = sd_write_object(oid, 0, buf, SD_DATA_OBJ_SIZE, 0, 0, inode->nr_copies, true, true); if (ret != SD_RES_SUCCESS) { ret = EXIT_FAILURE; goto out; } ret = sd_write_object(vid_to_vdi_oid(new_vid), 0, &new_vid, sizeof(new_vid), SD_INODE_HEADER_SIZE + sizeof(new_vid) * idx, 0, inode->nr_copies, false, true); if (ret) { ret = EXIT_FAILURE; goto out; } } vdi_show_progress(idx * SD_DATA_OBJ_SIZE, inode->vdi_size); ret = EXIT_SUCCESS; if (verbose) { if (raw_output) printf("%x\n", new_vid); else printf("VDI ID of newly created clone: %x\n", new_vid); } out: free(inode); free(buf); return ret; } static int vdi_resize(int argc, char **argv) { const char *vdiname = argv[optind++]; uint64_t new_size; uint32_t vid; int ret; char buf[SD_INODE_HEADER_SIZE]; struct sd_inode *inode = (struct sd_inode *)buf; if 
(!argv[optind]) { sd_err("Please specify the new size for the VDI"); return EXIT_USAGE; } ret = parse_option_size(argv[optind], &new_size); if (ret < 0) return EXIT_USAGE; if (new_size > SD_MAX_VDI_SIZE) { sd_err("New VDI size is too large"); return EXIT_USAGE; } ret = read_vdi_obj(vdiname, 0, "", &vid, inode, SD_INODE_HEADER_SIZE); if (ret != EXIT_SUCCESS) return ret; if (new_size < inode->vdi_size) { sd_err("Shrinking VDIs is not implemented"); return EXIT_USAGE; } inode->vdi_size = new_size; ret = sd_write_object(vid_to_vdi_oid(vid), 0, inode, SD_INODE_HEADER_SIZE, 0, 0, inode->nr_copies, false, true); if (ret != SD_RES_SUCCESS) { sd_err("Failed to update an inode header"); return EXIT_FAILURE; } return EXIT_SUCCESS; } static int do_vdi_delete(const char *vdiname, int snap_id, const char *snap_tag) { int ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; char data[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN]; uint32_t vid; ret = find_vdi_name(vdiname, snap_id, snap_tag, &vid, 0); if (ret < 0) { sd_err("Failed to open VDI %s", vdiname); return EXIT_FAILURE; } sd_init_req(&hdr, SD_OP_DELETE_CACHE); hdr.obj.oid = vid_to_vdi_oid(vid); ret = send_light_req(&hdr, sdhost, sdport); if (ret) { sd_err("failed to execute request"); return EXIT_FAILURE; } sd_init_req(&hdr, SD_OP_DEL_VDI); hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = sizeof(data); hdr.vdi.snapid = snap_id; memset(data, 0, sizeof(data)); pstrcpy(data, SD_MAX_VDI_LEN, vdiname); if (snap_tag) pstrcpy(data + SD_MAX_VDI_LEN, SD_MAX_VDI_TAG_LEN, snap_tag); ret = dog_exec_req(sdhost, sdport, &hdr, data); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result != SD_RES_SUCCESS) { sd_err("Failed to delete %s: %s", vdiname, sd_strerror(rsp->result)); if (rsp->result == SD_RES_NO_VDI) return EXIT_MISSING; else return EXIT_FAILURE; } return EXIT_SUCCESS; } static int vdi_delete(int argc, char **argv) { char *vdiname = argv[optind]; return do_vdi_delete(vdiname, vdi_cmd_data.snapshot_id, 
vdi_cmd_data.snapshot_tag); } static int vdi_rollback(int argc, char **argv) { const char *vdiname = argv[optind++]; uint32_t base_vid, new_vid; int ret; char buf[SD_INODE_HEADER_SIZE]; struct sd_inode *inode = (struct sd_inode *)buf; if (!vdi_cmd_data.snapshot_id && !vdi_cmd_data.snapshot_tag[0]) { sd_err("Please specify the '-s' option"); return EXIT_USAGE; } ret = read_vdi_obj(vdiname, vdi_cmd_data.snapshot_id, vdi_cmd_data.snapshot_tag, &base_vid, inode, SD_INODE_HEADER_SIZE); if (ret != EXIT_SUCCESS) return ret; if (!vdi_cmd_data.force) confirm("This operation dicards any changes made since the" " previous\nsnapshot was taken. Continue? [yes/no]: "); ret = do_vdi_delete(vdiname, 0, NULL); if (ret != SD_RES_SUCCESS) { sd_err("Failed to delete the current state"); return EXIT_FAILURE; } ret = do_vdi_create(vdiname, inode->vdi_size, base_vid, &new_vid, false, vdi_cmd_data.nr_copies); if (ret == EXIT_SUCCESS && verbose) { if (raw_output) printf("%x\n", new_vid); else printf("New VDI ID of rollbacked VDI: %x\n", new_vid); } return ret; } static int vdi_object(int argc, char **argv) { const char *vdiname = argv[optind]; unsigned idx = vdi_cmd_data.index; struct get_vdi_info info; uint32_t vid; memset(&info, 0, sizeof(info)); info.name = vdiname; info.tag = vdi_cmd_data.snapshot_tag; info.vid = 0; info.snapid = vdi_cmd_data.snapshot_id; if (parse_vdi(get_oid, SD_INODE_HEADER_SIZE, &info) < 0) return EXIT_SYSFAIL; vid = info.vid; if (vid == 0) { sd_err("VDI not found"); return EXIT_MISSING; } if (idx == ~0) { printf("Looking for the inode object 0x%" PRIx32 " with %d nodes\n\n", vid, sd_nodes_nr); parse_objs(vid_to_vdi_oid(vid), do_print_obj, NULL, SD_INODE_SIZE); } else { struct get_data_oid_info oid_info = {0}; oid_info.success = false; oid_info.idx = idx; if (idx >= MAX_DATA_OBJS) { printf("The offset is too large!\n"); exit(EXIT_FAILURE); } parse_objs(vid_to_vdi_oid(vid), get_data_oid, &oid_info, SD_DATA_OBJ_SIZE); if (oid_info.success) { if (oid_info.data_oid) { 
printf("Looking for the object 0x%" PRIx64 " (the inode vid 0x%" PRIx32 " idx %u) with %d nodes\n\n", oid_info.data_oid, vid, idx, sd_nodes_nr); parse_objs(oid_info.data_oid, do_print_obj, NULL, SD_DATA_OBJ_SIZE); } else printf("The inode object 0x%" PRIx32 " idx %u is not allocated\n", vid, idx); } else sd_err("Failed to read the inode object 0x%" PRIx32, vid); } return EXIT_SUCCESS; } static int do_track_object(uint64_t oid, uint8_t nr_copies) { int i, j, ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; struct sd_vnode *vnodes; const struct sd_vnode *vnode_buf[SD_MAX_COPIES]; struct epoch_log *logs; int vnodes_nr, nr_logs, log_length; log_length = sd_epoch * sizeof(struct epoch_log); logs = xmalloc(log_length); vnodes = xmalloc(sizeof(*vnodes) * SD_MAX_VNODES); sd_init_req(&hdr, SD_OP_STAT_CLUSTER); hdr.data_length = log_length; ret = dog_exec_req(sdhost, sdport, &hdr, logs); if (ret < 0) goto error; if (rsp->result != SD_RES_SUCCESS) { printf("%s\n", sd_strerror(rsp->result)); goto error; } nr_logs = rsp->data_length / sizeof(struct epoch_log); for (i = nr_logs - 1; i >= 0; i--) { printf("\nobj %"PRIx64" locations at epoch %d, copies = %d\n", oid, logs[i].epoch, nr_copies); printf("---------------------------------------------------\n"); /* * When # of nodes is less than nr_copies, we only print * remaining nodes that holds all the remaining copies. 
*/ if (logs[i].nr_nodes < nr_copies) { for (j = 0; j < logs[i].nr_nodes; j++) { const struct node_id *n = &logs[i].nodes[j].nid; printf("%s\n", addr_to_str(n->addr, n->port)); } continue; } vnodes_nr = nodes_to_vnodes(logs[i].nodes, logs[i].nr_nodes, vnodes); oid_to_vnodes(vnodes, vnodes_nr, oid, nr_copies, vnode_buf); for (j = 0; j < nr_copies; j++) { const struct node_id *n = &vnode_buf[j]->nid; printf("%s\n", addr_to_str(n->addr, n->port)); } } free(logs); free(vnodes); return EXIT_SUCCESS; error: free(logs); free(vnodes); return EXIT_SYSFAIL; } static int vdi_track(int argc, char **argv) { const char *vdiname = argv[optind]; unsigned idx = vdi_cmd_data.index; struct get_vdi_info info; struct get_data_oid_info oid_info = {0}; uint32_t vid; uint8_t nr_copies; memset(&info, 0, sizeof(info)); info.name = vdiname; info.tag = vdi_cmd_data.snapshot_tag; info.vid = 0; info.snapid = vdi_cmd_data.snapshot_id; if (parse_vdi(get_oid, SD_INODE_HEADER_SIZE, &info) < 0) return EXIT_SYSFAIL; vid = info.vid; nr_copies = info.nr_copies; if (vid == 0) { sd_err("VDI not found"); return EXIT_MISSING; } if (idx == ~0) { printf("Tracking the inode object 0x%" PRIx32 " with %d nodes\n", vid, sd_nodes_nr); return do_track_object(vid_to_vdi_oid(vid), nr_copies); } oid_info.success = false; oid_info.idx = idx; if (idx >= MAX_DATA_OBJS) { printf("The offset is too large!\n"); goto err; } parse_objs(vid_to_vdi_oid(vid), get_data_oid, &oid_info, SD_DATA_OBJ_SIZE); if (!oid_info.success) { sd_err("Failed to read the inode object 0x%" PRIx32, vid); goto err; } if (!oid_info.data_oid) { printf("The inode object 0x%"PRIx32" idx %u is not allocated\n", vid, idx); goto err; } printf("Tracking the object 0x%" PRIx64 " (the inode vid 0x%" PRIx32 " idx %u)" " with %d nodes\n", oid_info.data_oid, vid, idx, sd_nodes_nr); return do_track_object(oid_info.data_oid, nr_copies); err: return EXIT_FAILURE; } static int find_vdi_attr_oid(const char *vdiname, const char *tag, uint32_t snapid, const char *key, 
void *value, unsigned int value_len, uint32_t *vid, uint64_t *oid, unsigned int *nr_copies, bool create, bool excl, bool delete) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; int ret; struct sheepdog_vdi_attr vattr; memset(&vattr, 0, sizeof(vattr)); pstrcpy(vattr.name, SD_MAX_VDI_LEN, vdiname); pstrcpy(vattr.tag, SD_MAX_VDI_TAG_LEN, vdi_cmd_data.snapshot_tag); vattr.snap_id = vdi_cmd_data.snapshot_id; pstrcpy(vattr.key, SD_MAX_VDI_ATTR_KEY_LEN, key); if (value && value_len) { vattr.value_len = value_len; memcpy(vattr.value, value, value_len); } sd_init_req(&hdr, SD_OP_GET_VDI_ATTR); hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = SD_ATTR_OBJ_SIZE; hdr.vdi.snapid = vdi_cmd_data.snapshot_id; if (create) hdr.flags |= SD_FLAG_CMD_CREAT; if (excl) hdr.flags |= SD_FLAG_CMD_EXCL; if (delete) hdr.flags |= SD_FLAG_CMD_DEL; ret = dog_exec_req(sdhost, sdport, &hdr, &vattr); if (ret < 0) return SD_RES_EIO; if (rsp->result != SD_RES_SUCCESS) return rsp->result; *vid = rsp->vdi.vdi_id; *oid = vid_to_attr_oid(rsp->vdi.vdi_id, rsp->vdi.attr_id); *nr_copies = rsp->vdi.copies; return SD_RES_SUCCESS; } static int vdi_setattr(int argc, char **argv) { int ret, value_len = 0; uint64_t attr_oid = 0; uint32_t vid = 0, nr_copies = 0; const char *vdiname = argv[optind++], *key; char *value; uint64_t offset; key = argv[optind++]; if (!key) { sd_err("Please specify the attribute key"); return EXIT_USAGE; } value = argv[optind++]; if (!value && !vdi_cmd_data.delete) { value = xmalloc(SD_MAX_VDI_ATTR_VALUE_LEN); offset = 0; reread: ret = read(STDIN_FILENO, value + offset, SD_MAX_VDI_ATTR_VALUE_LEN - offset); if (ret < 0) { sd_err("Failed to read attribute value from stdin: %m"); return EXIT_SYSFAIL; } if (ret > 0) { offset += ret; goto reread; } } if (value) value_len = strlen(value); ret = find_vdi_attr_oid(vdiname, vdi_cmd_data.snapshot_tag, vdi_cmd_data.snapshot_id, key, value, value_len, &vid, &attr_oid, &nr_copies, !vdi_cmd_data.delete, vdi_cmd_data.exclusive, 
vdi_cmd_data.delete); if (ret) { if (ret == SD_RES_VDI_EXIST) { sd_err("The attribute '%s' already exists", key); return EXIT_EXISTS; } else if (ret == SD_RES_NO_OBJ) { sd_err("Attribute '%s' not found", key); return EXIT_MISSING; } else if (ret == SD_RES_NO_VDI) { sd_err("VDI not found"); return EXIT_MISSING; } else sd_err("Failed to set attribute: %s", sd_strerror(ret)); return EXIT_FAILURE; } return EXIT_SUCCESS; } static int vdi_getattr(int argc, char **argv) { int ret; uint64_t oid, attr_oid = 0; uint32_t vid = 0, nr_copies = 0; const char *vdiname = argv[optind++], *key; struct sheepdog_vdi_attr vattr; key = argv[optind++]; if (!key) { sd_err("Please specify the attribute key"); return EXIT_USAGE; } ret = find_vdi_attr_oid(vdiname, vdi_cmd_data.snapshot_tag, vdi_cmd_data.snapshot_id, key, NULL, 0, &vid, &attr_oid, &nr_copies, false, false, false); if (ret == SD_RES_NO_OBJ) { sd_err("Attribute '%s' not found", key); return EXIT_MISSING; } else if (ret == SD_RES_NO_VDI) { sd_err("VDI not found"); return EXIT_MISSING; } else if (ret) { sd_err("Failed to find attribute oid: %s", sd_strerror(ret)); return EXIT_MISSING; } oid = attr_oid; ret = sd_read_object(oid, &vattr, SD_ATTR_OBJ_SIZE, 0, true); if (ret != SD_RES_SUCCESS) { sd_err("Failed to read attribute oid: %s", sd_strerror(ret)); return EXIT_SYSFAIL; } xwrite(STDOUT_FILENO, vattr.value, vattr.value_len); return EXIT_SUCCESS; } static int vdi_read(int argc, char **argv) { const char *vdiname = argv[optind++]; int ret, idx; struct sd_inode *inode = NULL; uint64_t offset = 0, oid, done = 0, total = (uint64_t) -1; unsigned int len; char *buf = NULL; if (argv[optind]) { ret = parse_option_size(argv[optind++], &offset); if (ret < 0) return EXIT_USAGE; if (argv[optind]) { ret = parse_option_size(argv[optind++], &total); if (ret < 0) return EXIT_USAGE; } } inode = malloc(sizeof(*inode)); buf = xmalloc(SD_DATA_OBJ_SIZE); ret = read_vdi_obj(vdiname, vdi_cmd_data.snapshot_id, vdi_cmd_data.snapshot_tag, NULL, inode, 
SD_INODE_SIZE); if (ret != EXIT_SUCCESS) goto out; if (inode->vdi_size < offset) { sd_err("Read offset is beyond the end of the VDI"); ret = EXIT_FAILURE; goto out; } total = min(total, inode->vdi_size - offset); idx = offset / SD_DATA_OBJ_SIZE; offset %= SD_DATA_OBJ_SIZE; while (done < total) { len = min(total - done, SD_DATA_OBJ_SIZE - offset); if (inode->data_vdi_id[idx]) { oid = vid_to_data_oid(inode->data_vdi_id[idx], idx); ret = sd_read_object(oid, buf, len, offset, false); if (ret != SD_RES_SUCCESS) { sd_err("Failed to read VDI"); ret = EXIT_FAILURE; goto out; } } else memset(buf, 0, len); ret = xwrite(STDOUT_FILENO, buf, len); if (ret < 0) { sd_err("Failed to write to stdout: %m"); ret = EXIT_SYSFAIL; goto out; } offset = 0; idx++; done += len; } fsync(STDOUT_FILENO); ret = EXIT_SUCCESS; out: free(inode); free(buf); return ret; } static int vdi_write(int argc, char **argv) { const char *vdiname = argv[optind++]; uint32_t vid, flags; int ret, idx; struct sd_inode *inode = NULL; uint64_t offset = 0, oid, old_oid, done = 0, total = (uint64_t) -1; unsigned int len; char *buf = NULL; bool create; if (argv[optind]) { ret = parse_option_size(argv[optind++], &offset); if (ret < 0) return EXIT_USAGE; if (argv[optind]) { ret = parse_option_size(argv[optind++], &total); if (ret < 0) return EXIT_USAGE; } } inode = xmalloc(sizeof(*inode)); buf = xmalloc(SD_DATA_OBJ_SIZE); ret = read_vdi_obj(vdiname, 0, "", &vid, inode, SD_INODE_SIZE); if (ret != EXIT_SUCCESS) goto out; if (inode->vdi_size < offset) { sd_err("Write offset is beyond the end of the VDI"); ret = EXIT_FAILURE; goto out; } total = min(total, inode->vdi_size - offset); idx = offset / SD_DATA_OBJ_SIZE; offset %= SD_DATA_OBJ_SIZE; while (done < total) { create = false; old_oid = 0; flags = 0; len = min(total - done, SD_DATA_OBJ_SIZE - offset); if (!inode->data_vdi_id[idx]) create = true; else if (!is_data_obj_writeable(inode, idx)) { create = true; old_oid = vid_to_data_oid(inode->data_vdi_id[idx], idx); } if 
(vdi_cmd_data.writeback) flags |= SD_FLAG_CMD_CACHE; ret = xread(STDIN_FILENO, buf, len); if (ret < 0) { sd_err("Failed to read from stdin: %m"); ret = EXIT_SYSFAIL; goto out; } else if (ret < len) { /* exit after this buffer is sent */ memset(buf + ret, 0, len - ret); total = done + len; } inode->data_vdi_id[idx] = inode->vdi_id; oid = vid_to_data_oid(inode->data_vdi_id[idx], idx); ret = sd_write_object(oid, old_oid, buf, len, offset, flags, inode->nr_copies, create, false); if (ret != SD_RES_SUCCESS) { sd_err("Failed to write VDI"); ret = EXIT_FAILURE; goto out; } if (create) { ret = sd_write_object(vid_to_vdi_oid(vid), 0, &vid, sizeof(vid), SD_INODE_HEADER_SIZE + sizeof(vid) * idx, flags, inode->nr_copies, false, false); if (ret) { ret = EXIT_FAILURE; goto out; } } offset += len; if (offset == SD_DATA_OBJ_SIZE) { offset = 0; idx++; } done += len; } ret = EXIT_SUCCESS; out: free(inode); free(buf); return ret; } static void *read_object_from(const struct sd_vnode *vnode, uint64_t oid) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; int ret; void *buf; size_t size = get_objsize(oid); buf = xmalloc(size); sd_init_req(&hdr, SD_OP_READ_PEER); hdr.epoch = sd_epoch; hdr.flags = 0; hdr.data_length = size; hdr.obj.oid = oid; ret = dog_exec_req(vnode->nid.addr, vnode->nid.port, &hdr, buf); if (ret < 0) exit(EXIT_SYSFAIL); switch (rsp->result) { case SD_RES_SUCCESS: untrim_zero_blocks(buf, rsp->obj.offset, rsp->data_length, size); break; case SD_RES_NO_OBJ: free(buf); return NULL; default: sd_err("FATAL: failed to read %"PRIx64", %s", oid, sd_strerror(rsp->result)); exit(EXIT_FAILURE); } return buf; } static void write_object_to(const struct sd_vnode *vnode, uint64_t oid, void *buf, bool create) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; int ret; if (create) sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_PEER); else sd_init_req(&hdr, SD_OP_WRITE_PEER); hdr.epoch = sd_epoch; hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = get_objsize(oid); 
hdr.obj.oid = oid; ret = dog_exec_req(vnode->nid.addr, vnode->nid.port, &hdr, buf); if (ret < 0) exit(EXIT_SYSFAIL); if (rsp->result != SD_RES_SUCCESS) { sd_err("FATAL: failed to write %"PRIx64", %s", oid, sd_strerror(rsp->result)); exit(EXIT_FAILURE); } } struct vdi_check_work { struct vdi_check_info *info; const struct sd_vnode *vnode; uint8_t hash[SHA1_DIGEST_SIZE]; bool object_found; struct work work; }; struct vdi_check_info { uint64_t oid; int nr_copies; uint64_t total; uint64_t *done; int refcnt; struct work_queue *wq; struct vdi_check_work *base; struct vdi_check_work vcw[0]; }; static void free_vdi_check_info(struct vdi_check_info *info) { if (info->done) { *info->done += SD_DATA_OBJ_SIZE; vdi_show_progress(*info->done, info->total); } free(info); } static void vdi_repair_work(struct work *work) { struct vdi_check_work *vcw = container_of(work, struct vdi_check_work, work); struct vdi_check_info *info = vcw->info; void *buf; buf = read_object_from(info->base->vnode, info->oid); write_object_to(vcw->vnode, info->oid, buf, !vcw->object_found); free(buf); } static void vdi_repair_main(struct work *work) { struct vdi_check_work *vcw = container_of(work, struct vdi_check_work, work); struct vdi_check_info *info = vcw->info; if (vcw->object_found) fprintf(stdout, "fixed replica %"PRIx64"\n", info->oid); else fprintf(stdout, "fixed missing %"PRIx64"\n", info->oid); info->refcnt--; if (info->refcnt == 0) free_vdi_check_info(info); } static void vdi_hash_check_work(struct work *work) { struct vdi_check_work *vcw = container_of(work, struct vdi_check_work, work); struct vdi_check_info *info = vcw->info; int ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; sd_init_req(&hdr, SD_OP_GET_HASH); hdr.obj.oid = info->oid; hdr.obj.tgt_epoch = sd_epoch; ret = dog_exec_req(vcw->vnode->nid.addr, vcw->vnode->nid.port, &hdr, NULL); if (ret < 0) exit(EXIT_SYSFAIL); switch (rsp->result) { case SD_RES_SUCCESS: vcw->object_found = true; memcpy(vcw->hash, 
rsp->hash.digest, sizeof(vcw->hash)); uatomic_set(&info->base, vcw); break; case SD_RES_NO_OBJ: vcw->object_found = false; break; default: sd_err("failed to read %" PRIx64 " from %s, %s", info->oid, addr_to_str(vcw->vnode->nid.addr, vcw->vnode->nid.port), sd_strerror(rsp->result)); exit(EXIT_FAILURE); } } static void vdi_hash_check_main(struct work *work) { struct vdi_check_work *vcw = container_of(work, struct vdi_check_work, work); struct vdi_check_info *info = vcw->info; info->refcnt--; if (info->refcnt > 0) return; if (info->base == NULL) { sd_err("no node has %" PRIx64, info->oid); exit(EXIT_FAILURE); } for (int i = 0; i < info->nr_copies; i++) { if (&info->vcw[i] == info->base) continue; /* need repair when object not found or consistency broken */ if (!info->vcw[i].object_found || memcmp(info->base->hash, info->vcw[i].hash, sizeof(info->base->hash)) != 0) { info->vcw[i].work.fn = vdi_repair_work; info->vcw[i].work.done = vdi_repair_main; info->refcnt++; queue_work(info->wq, &info->vcw[i].work); } } if (info->refcnt == 0) free_vdi_check_info(info); } static void queue_vdi_check_work(struct sd_inode *inode, uint64_t oid, uint64_t *done, struct work_queue *wq) { struct vdi_check_info *info; const struct sd_vnode *tgt_vnodes[SD_MAX_COPIES]; int nr_copies = inode->nr_copies; info = xzalloc(sizeof(*info) + sizeof(info->vcw[0]) * nr_copies); info->oid = oid; info->nr_copies = nr_copies; info->total = inode->vdi_size; info->done = done; info->wq = wq; oid_to_vnodes(sd_vnodes, sd_vnodes_nr, oid, nr_copies, tgt_vnodes); for (int i = 0; i < nr_copies; i++) { info->vcw[i].info = info; info->vcw[i].vnode = tgt_vnodes[i]; info->vcw[i].work.fn = vdi_hash_check_work; info->vcw[i].work.done = vdi_hash_check_main; info->refcnt++; queue_work(info->wq, &info->vcw[i].work); } } static int vdi_check(int argc, char **argv) { const char *vdiname = argv[optind++]; int ret, max_idx; uint64_t done = 0, oid; uint32_t vid; struct sd_inode *inode = xmalloc(sizeof(*inode)); struct 
work_queue *wq; ret = read_vdi_obj(vdiname, vdi_cmd_data.snapshot_id, vdi_cmd_data.snapshot_tag, &vid, inode, SD_INODE_SIZE); if (ret != EXIT_SUCCESS) { sd_err("FATAL: no inode objects"); goto out; } if (sd_nodes_nr < inode->nr_copies) { sd_err("ABORT: Not enough active nodes for consistency-check"); return EXIT_FAILURE; } wq = create_work_queue("vdi check", WQ_DYNAMIC); queue_vdi_check_work(inode, vid_to_vdi_oid(vid), NULL, wq); max_idx = DIV_ROUND_UP(inode->vdi_size, SD_DATA_OBJ_SIZE); vdi_show_progress(done, inode->vdi_size); for (int idx = 0; idx < max_idx; idx++) { vid = inode->data_vdi_id[idx]; if (vid) { oid = vid_to_data_oid(vid, idx); queue_vdi_check_work(inode, oid, &done, wq); } else { done += SD_DATA_OBJ_SIZE; vdi_show_progress(done, inode->vdi_size); } } work_queue_wait(wq); fprintf(stdout, "finish check&repair %s\n", vdiname); return EXIT_SUCCESS; out: return ret; } /* vdi backup format */ #define VDI_BACKUP_FORMAT_VERSION 1 #define VDI_BACKUP_MAGIC 0x11921192 struct backup_hdr { uint32_t version; uint32_t magic; }; struct obj_backup { uint32_t idx; uint32_t offset; uint32_t length; uint32_t reserved; uint8_t data[SD_DATA_OBJ_SIZE]; }; /* discards redundant area from backup data */ static void compact_obj_backup(struct obj_backup *backup, uint8_t *from_data) { uint8_t *p1, *p2; p1 = backup->data; p2 = from_data; while (backup->length > 0 && memcmp(p1, p2, SECTOR_SIZE) == 0) { p1 += SECTOR_SIZE; p2 += SECTOR_SIZE; backup->offset += SECTOR_SIZE; backup->length -= SECTOR_SIZE; } p1 = backup->data + SD_DATA_OBJ_SIZE - SECTOR_SIZE; p2 = from_data + SD_DATA_OBJ_SIZE - SECTOR_SIZE; while (backup->length > 0 && memcmp(p1, p2, SECTOR_SIZE) == 0) { p1 -= SECTOR_SIZE; p2 -= SECTOR_SIZE; backup->length -= SECTOR_SIZE; } } static int get_obj_backup(int idx, uint32_t from_vid, uint32_t to_vid, struct obj_backup *backup) { int ret; uint8_t *from_data = xzalloc(SD_DATA_OBJ_SIZE); backup->idx = idx; backup->offset = 0; backup->length = SD_DATA_OBJ_SIZE; if (to_vid) { 
ret = sd_read_object(vid_to_data_oid(to_vid, idx), backup->data, SD_DATA_OBJ_SIZE, 0, true); if (ret != SD_RES_SUCCESS) { sd_err("Failed to read object %" PRIx32 ", %d", to_vid, idx); return EXIT_FAILURE; } } else memset(backup->data, 0, SD_DATA_OBJ_SIZE); if (from_vid) { ret = sd_read_object(vid_to_data_oid(from_vid, idx), from_data, SD_DATA_OBJ_SIZE, 0, true); if (ret != SD_RES_SUCCESS) { sd_err("Failed to read object %" PRIx32 ", %d", from_vid, idx); return EXIT_FAILURE; } } compact_obj_backup(backup, from_data); free(from_data); return EXIT_SUCCESS; } static int vdi_backup(int argc, char **argv) { const char *vdiname = argv[optind++]; int ret = EXIT_SUCCESS, idx, nr_objs; struct sd_inode *from_inode = xzalloc(sizeof(*from_inode)); struct sd_inode *to_inode = xzalloc(sizeof(*to_inode)); struct backup_hdr hdr = { .version = VDI_BACKUP_FORMAT_VERSION, .magic = VDI_BACKUP_MAGIC, }; struct obj_backup *backup = xzalloc(sizeof(*backup)); if ((!vdi_cmd_data.snapshot_id && !vdi_cmd_data.snapshot_tag[0]) || (!vdi_cmd_data.from_snapshot_id && !vdi_cmd_data.from_snapshot_tag[0])) { sd_err("Please specify snapshots with '-F' and '-s' options"); ret = EXIT_USAGE; goto out; } ret = read_vdi_obj(vdiname, vdi_cmd_data.from_snapshot_id, vdi_cmd_data.from_snapshot_tag, NULL, from_inode, SD_INODE_SIZE); if (ret != EXIT_SUCCESS) goto out; ret = read_vdi_obj(vdiname, vdi_cmd_data.snapshot_id, vdi_cmd_data.snapshot_tag, NULL, to_inode, SD_INODE_SIZE); if (ret != EXIT_SUCCESS) goto out; nr_objs = DIV_ROUND_UP(to_inode->vdi_size, SD_DATA_OBJ_SIZE); ret = xwrite(STDOUT_FILENO, &hdr, sizeof(hdr)); if (ret < 0) { sd_err("failed to write backup header, %m"); ret = EXIT_SYSFAIL; goto out; } for (idx = 0; idx < nr_objs; idx++) { uint32_t from_vid = from_inode->data_vdi_id[idx]; uint32_t to_vid = to_inode->data_vdi_id[idx]; if (to_vid == 0 && from_vid == 0) continue; ret = get_obj_backup(idx, from_vid, to_vid, backup); if (ret != EXIT_SUCCESS) goto out; if (backup->length == 0) continue; ret 
= xwrite(STDOUT_FILENO, backup, sizeof(*backup) - sizeof(backup->data)); if (ret < 0) { sd_err("failed to write backup data, %m"); ret = EXIT_SYSFAIL; goto out; } ret = xwrite(STDOUT_FILENO, backup->data + backup->offset, backup->length); if (ret < 0) { sd_err("failed to write backup data, %m"); ret = EXIT_SYSFAIL; goto out; } } /* write end marker */ memset(backup, 0, sizeof(*backup) - sizeof(backup->data)); backup->idx = UINT32_MAX; ret = xwrite(STDOUT_FILENO, backup, sizeof(*backup) - sizeof(backup->data)); if (ret < 0) { sd_err("failed to write end marker, %m"); ret = EXIT_SYSFAIL; goto out; } fsync(STDOUT_FILENO); ret = EXIT_SUCCESS; out: free(from_inode); free(to_inode); free(backup); return ret; } /* restore backup data to vdi */ static int restore_obj(struct obj_backup *backup, uint32_t vid, struct sd_inode *parent_inode) { int ret; uint32_t parent_vid = parent_inode->data_vdi_id[backup->idx]; uint64_t parent_oid = 0; if (parent_vid) parent_oid = vid_to_data_oid(parent_vid, backup->idx); /* send a copy-on-write request */ ret = sd_write_object(vid_to_data_oid(vid, backup->idx), parent_oid, backup->data, backup->length, backup->offset, 0, parent_inode->nr_copies, true, true); if (ret != SD_RES_SUCCESS) return ret; return sd_write_object(vid_to_vdi_oid(vid), 0, &vid, sizeof(vid), SD_INODE_HEADER_SIZE + sizeof(vid) * backup->idx, 0, parent_inode->nr_copies, false, true); } static uint32_t do_restore(const char *vdiname, int snapid, const char *tag) { int ret; uint32_t vid; struct backup_hdr hdr; struct obj_backup *backup = xzalloc(sizeof(*backup)); struct sd_inode *inode = xzalloc(sizeof(*inode)); ret = xread(STDIN_FILENO, &hdr, sizeof(hdr)); if (ret != sizeof(hdr)) sd_err("failed to read backup header, %m"); if (hdr.version != VDI_BACKUP_FORMAT_VERSION || hdr.magic != VDI_BACKUP_MAGIC) { sd_err("The backup file is corrupted"); ret = EXIT_SYSFAIL; goto out; } ret = read_vdi_obj(vdiname, snapid, tag, NULL, inode, SD_INODE_SIZE); if (ret != EXIT_SUCCESS) goto 
out; ret = do_vdi_create(vdiname, inode->vdi_size, inode->vdi_id, &vid, false, inode->nr_copies); if (ret != EXIT_SUCCESS) { sd_err("Failed to read VDI"); goto out; } while (true) { ret = xread(STDIN_FILENO, backup, sizeof(*backup) - sizeof(backup->data)); if (ret != sizeof(*backup) - sizeof(backup->data)) { sd_err("failed to read backup data"); ret = EXIT_SYSFAIL; break; } if (backup->idx == UINT32_MAX) { ret = EXIT_SUCCESS; break; } ret = xread(STDIN_FILENO, backup->data, backup->length); if (ret != backup->length) { sd_err("failed to read backup data"); ret = EXIT_SYSFAIL; break; } ret = restore_obj(backup, vid, inode); if (ret != SD_RES_SUCCESS) { sd_err("failed to restore backup"); do_vdi_delete(vdiname, 0, NULL); ret = EXIT_FAILURE; break; } } out: free(backup); free(inode); return ret; } static int vdi_restore(int argc, char **argv) { const char *vdiname = argv[optind++]; int ret; char buf[SD_INODE_HEADER_SIZE] = {0}; struct sd_inode *inode_for_check = xzalloc(sizeof(*inode_for_check)); struct sd_inode *current_inode = xzalloc(sizeof(*current_inode)); struct sd_inode *parent_inode = (struct sd_inode *)buf; bool need_current_recovery = false; if (!vdi_cmd_data.snapshot_id && !vdi_cmd_data.snapshot_tag[0]) { sd_err("We can restore a backup file only to snapshots"); sd_err("Please specify the '-s' option"); ret = EXIT_USAGE; goto out; } ret = read_vdi_obj(vdiname, vdi_cmd_data.snapshot_id, vdi_cmd_data.snapshot_tag, NULL, inode_for_check, SD_INODE_SIZE); if (ret != SD_RES_SUCCESS) { sd_err("Snapshot ID %d or tag %s doesn't exist", vdi_cmd_data.snapshot_id, vdi_cmd_data.snapshot_tag); goto out; } /* * delete the current vdi temporarily first to avoid making * the current state become snapshot */ ret = read_vdi_obj(vdiname, 0, "", NULL, current_inode, SD_INODE_HEADER_SIZE); if (ret != EXIT_SUCCESS) goto out; ret = sd_read_object(vid_to_vdi_oid(current_inode->parent_vdi_id), parent_inode, SD_INODE_HEADER_SIZE, 0, true); if (ret != SD_RES_SUCCESS) { 
		printf("error\n");
		goto out;
	}

	/* restoring reads the backup stream from stdin; a tty makes no sense */
	if (is_stdin_console()) {
		sd_err("stdin must be pipe");
		ret = EXIT_USAGE;
		goto out;
	}

	ret = do_vdi_delete(vdiname, 0, NULL);
	if (ret != EXIT_SUCCESS) {
		sd_err("Failed to delete the current state");
		goto out;
	}
	/* from here on the working VDI must be recreated on any outcome */
	need_current_recovery = true;

	/* restore backup data */
	ret = do_restore(vdiname, vdi_cmd_data.snapshot_id,
			 vdi_cmd_data.snapshot_tag);
out:
	if (need_current_recovery) {
		int recovery_ret;
		/* recreate the current vdi object */
		recovery_ret = do_vdi_create(vdiname, current_inode->vdi_size,
					     current_inode->parent_vdi_id,
					     NULL, true,
					     current_inode->nr_copies);
		if (recovery_ret != EXIT_SUCCESS) {
			sd_err("failed to resume the current vdi");
			ret = recovery_ret;
		}
	}
	free(current_inode);
	free(inode_for_check);

	return ret;
}

/*
 * 'dog vdi cache flush': ask the local sheep to flush the object cache of
 * the given VDI (SD_OP_FLUSH_VDI).  Returns EXIT_SUCCESS or EXIT_FAILURE.
 */
static int vdi_cache_flush(int argc, char **argv)
{
	const char *vdiname = argv[optind++];
	struct sd_req hdr;
	uint32_t vid;
	int ret = EXIT_SUCCESS;

	ret = find_vdi_name(vdiname, vdi_cmd_data.snapshot_id,
			    vdi_cmd_data.snapshot_tag, &vid, 0);
	if (ret < 0) {
		sd_err("Failed to open VDI %s", vdiname);
		ret = EXIT_FAILURE;
		goto out;
	}

	sd_init_req(&hdr, SD_OP_FLUSH_VDI);
	hdr.obj.oid = vid_to_vdi_oid(vid);

	/* NOTE(review): on success this falls through and returns
	 * send_light_req()'s 0, which equals EXIT_SUCCESS */
	ret = send_light_req(&hdr, sdhost, sdport);
	if (ret) {
		sd_err("failed to execute request");
		return EXIT_FAILURE;
	}
out:
	return ret;
}

/*
 * 'dog vdi cache delete': drop the object cache of the given VDI on all
 * nodes (SD_OP_DELETE_CACHE).  Same structure and exit codes as
 * vdi_cache_flush() above.
 */
static int vdi_cache_delete(int argc, char **argv)
{
	const char *vdiname = argv[optind++];
	struct sd_req hdr;
	uint32_t vid;
	int ret = EXIT_SUCCESS;

	ret = find_vdi_name(vdiname, vdi_cmd_data.snapshot_id,
			    vdi_cmd_data.snapshot_tag, &vid, 0);
	if (ret < 0) {
		sd_err("Failed to open VDI %s", vdiname);
		ret = EXIT_FAILURE;
		goto out;
	}

	sd_init_req(&hdr, SD_OP_DELETE_CACHE);
	hdr.obj.oid = vid_to_vdi_oid(vid);

	ret = send_light_req(&hdr, sdhost, sdport);
	if (ret) {
		sd_err("failed to execute request");
		return EXIT_FAILURE;
	}
out:
	return ret;
}

/* resolve a vid to its VDI name and tag by reading the inode header */
static int vid_to_name_tag(uint32_t vid, char *name, char *tag)
{
	struct sd_inode inode;
	int ret;

	ret = sd_read_object(vid_to_vdi_oid(vid), &inode,
SD_INODE_HEADER_SIZE, 0, true); if (ret != SD_RES_SUCCESS) return ret; pstrcpy(name, SD_MAX_VDI_LEN, inode.name); pstrcpy(tag, SD_MAX_VDI_TAG_LEN, inode.tag); return SD_RES_SUCCESS; } static int vdi_cache_info(int argc, char **argv) { struct object_cache_info info = {}; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; char size_str[UINT64_DECIMAL_SIZE], used_str[UINT64_DECIMAL_SIZE]; int ret, i; sd_init_req(&hdr, SD_OP_GET_CACHE_INFO); hdr.data_length = sizeof(info); ret = dog_exec_req(sdhost, sdport, &hdr, &info); if (ret < 0) return EXIT_SYSFAIL; if (rsp->result != SD_RES_SUCCESS) { sd_err("failed to get cache infomation: %s", sd_strerror(rsp->result)); return EXIT_FAILURE; } fprintf(stdout, "Name\tTag\tTotal\tDirty\tClean\n"); for (i = 0; i < info.count; i++) { char total_str[UINT64_DECIMAL_SIZE], dirty_str[UINT64_DECIMAL_SIZE], clean_str[UINT64_DECIMAL_SIZE]; uint64_t total = info.caches[i].total * SD_DATA_OBJ_SIZE, dirty = info.caches[i].dirty * SD_DATA_OBJ_SIZE, clean = total - dirty; char name[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN]; size_to_str(total, total_str, sizeof(total_str)); size_to_str(dirty, dirty_str, sizeof(dirty_str)); size_to_str(clean, clean_str, sizeof(clean_str)); ret = vid_to_name_tag(info.caches[i].vid, name, tag); if (ret != SD_RES_SUCCESS) return EXIT_FAILURE; fprintf(stdout, "%s\t%s\t%s\t%s\t%s\n", name, tag, total_str, dirty_str, clean_str); } size_to_str(info.size, size_str, sizeof(size_str)); size_to_str(info.used, used_str, sizeof(used_str)); fprintf(stdout, "\nCache size %s, used %s\n", size_str, used_str); return EXIT_SUCCESS; } static struct subcommand vdi_cache_cmd[] = { {"flush", NULL, NULL, "flush the cache of the vdi specified.", NULL, CMD_NEED_ARG, vdi_cache_flush}, {"delete", NULL, NULL, "delete the cache of the vdi specified in all nodes.", NULL, CMD_NEED_ARG, vdi_cache_delete}, {"info", NULL, NULL, "show usage of the cache", NULL, 0, vdi_cache_info}, {NULL,}, }; static int vdi_cache(int argc, char **argv) { 
return do_generic_subcommand(vdi_cache_cmd, argc, argv); } static struct subcommand vdi_cmd[] = { {"check", "", "saph", "check and repair image's consistency", NULL, CMD_NEED_NODELIST|CMD_NEED_ARG, vdi_check, vdi_options}, {"create", " ", "Pcaphrv", "create an image", NULL, CMD_NEED_NODELIST|CMD_NEED_ARG, vdi_create, vdi_options}, {"snapshot", "", "saphrv", "create a snapshot", NULL, CMD_NEED_ARG, vdi_snapshot, vdi_options}, {"clone", " ", "sPcaphrv", "clone an image", NULL, CMD_NEED_ARG, vdi_clone, vdi_options}, {"delete", "", "saph", "delete an image", NULL, CMD_NEED_ARG, vdi_delete, vdi_options}, {"rollback", "", "saphfrv", "rollback to a snapshot", NULL, CMD_NEED_ARG, vdi_rollback, vdi_options}, {"list", "[vdiname]", "aprh", "list images", NULL, 0, vdi_list, vdi_options}, {"tree", NULL, "aph", "show images in tree view format", NULL, 0, vdi_tree, vdi_options}, {"graph", NULL, "aph", "show images in Graphviz dot format", NULL, 0, vdi_graph, vdi_options}, {"object", "", "isaph", "show object information in the image", NULL, CMD_NEED_NODELIST|CMD_NEED_ARG, vdi_object, vdi_options}, {"track", "", "isaph", "show the object epoch trace in the image", NULL, CMD_NEED_NODELIST|CMD_NEED_ARG, vdi_track, vdi_options}, {"setattr", " [value]", "dxaph", "set a VDI attribute", NULL, CMD_NEED_ARG, vdi_setattr, vdi_options}, {"getattr", " ", "aph", "get a VDI attribute", NULL, CMD_NEED_ARG, vdi_getattr, vdi_options}, {"resize", " ", "aph", "resize an image", NULL, CMD_NEED_ARG, vdi_resize, vdi_options}, {"read", " [ []]", "saph", "read data from an image", NULL, CMD_NEED_ARG, vdi_read, vdi_options}, {"write", " [ []]", "apwh", "write data to an image", NULL, CMD_NEED_ARG, vdi_write, vdi_options}, {"backup", " ", "sFaph", "create an incremental backup between two snapshots", NULL, CMD_NEED_NODELIST|CMD_NEED_ARG, vdi_backup, vdi_options}, {"restore", " ", "saph", "restore snapshot images from a backup", NULL, CMD_NEED_NODELIST|CMD_NEED_ARG, vdi_restore, vdi_options}, {"cache", "", 
"saph", "Run 'dog vdi cache' for more information", vdi_cache_cmd, CMD_NEED_ARG, vdi_cache, vdi_options}, {NULL,}, }; static int vdi_parser(int ch, char *opt) { char *p; int nr_copies; switch (ch) { case 'P': vdi_cmd_data.prealloc = true; break; case 'i': vdi_cmd_data.index = strtol(opt, &p, 10); if (opt == p) { sd_err("The index must be an integer"); exit(EXIT_FAILURE); } break; case 's': vdi_cmd_data.snapshot_id = strtol(opt, &p, 10); if (opt == p) { vdi_cmd_data.snapshot_id = 0; pstrcpy(vdi_cmd_data.snapshot_tag, sizeof(vdi_cmd_data.snapshot_tag), opt); } else if (vdi_cmd_data.snapshot_id == 0) { fprintf(stderr, "The snapshot id must be larger than zero\n"); exit(EXIT_FAILURE); } break; case 'x': vdi_cmd_data.exclusive = true; break; case 'd': vdi_cmd_data.delete = true; break; case 'w': vdi_cmd_data.writeback = true; break; case 'c': nr_copies = strtol(opt, &p, 10); if (opt == p || nr_copies < 0 || nr_copies > SD_MAX_COPIES) { sd_err("Invalid copies number, must be " "an integer between 0 and %d", SD_MAX_COPIES); exit(EXIT_FAILURE); } vdi_cmd_data.nr_copies = nr_copies; break; case 'F': vdi_cmd_data.from_snapshot_id = strtol(opt, &p, 10); if (opt == p) { vdi_cmd_data.from_snapshot_id = 0; pstrcpy(vdi_cmd_data.from_snapshot_tag, sizeof(vdi_cmd_data.from_snapshot_tag), opt); } break; case 'f': vdi_cmd_data.force = true; break; } return 0; } struct command vdi_command = { "vdi", vdi_cmd, vdi_parser }; sheepdog-0.7.5/include/000077500000000000000000000000001223630776600147605ustar00rootroot00000000000000sheepdog-0.7.5/include/Makefile.am000066400000000000000000000004201223630776600170100ustar00rootroot00000000000000MAINTAINERCLEANFILES = Makefile.in config.h.in noinst_HEADERS = bitops.h event.h logger.h sheepdog_proto.h util.h \ list.h net.h sheep.h exits.h strbuf.h rbtree.h \ sha1.h option.h internal_proto.h shepherd.h work.h \ sockfd_cache.h compiler.h 
sheepdog-0.7.5/include/bitops.h000066400000000000000000000106731223630776600164400ustar00rootroot00000000000000#ifndef __BITOPS_H__ #define __BITOPS_H__ #include #include "util.h" #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) #define BITS_PER_BYTE 8 #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) #define DECLARE_BITMAP(name, bits) \ unsigned long name[BITS_TO_LONGS(bits)] #define BITS_PER_LONG (BITS_PER_BYTE * sizeof(long)) #define __ffs(x) (x ? __builtin_ffsl(x) - 1 : 0) #define ffz(x) __ffs(~(x)) #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) /* * Iterate over a bitmap * * @nr: the bit number to use as a loop cursor * @addr: the bitmap you iterate over * @bits: the number of bits this bitmap contains */ #define FOR_EACH_BIT(nr, addr, bits) \ for (nr = find_next_bit((addr), (bits), 0); \ nr < (bits); \ nr = find_next_bit((addr), (bits), nr + 1)) /* * Change the size of allocated bitmap * * This doesn't change the contents of the old bitmap pointed to by `ptr`, and * initializes the newly allocated area with zeros. 
*/ static inline unsigned long *alloc_bitmap(unsigned long *old_bmap, size_t old_bits, size_t new_bits) { size_t old_size = BITS_TO_LONGS(old_bits) * sizeof(long); size_t new_size = BITS_TO_LONGS(new_bits) * sizeof(long); unsigned long *new_bmap = xrealloc(old_bmap, new_size); if (old_bits < new_bits) memset((char *)new_bmap + old_size, 0, new_size - old_size); return new_bmap; } static inline unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BITOP_WORD(offset); unsigned long result = offset & ~(BITS_PER_LONG-1); unsigned long tmp; if (offset >= size) return size; size -= result; offset %= BITS_PER_LONG; if (offset) { tmp = *(p++); tmp |= ~0UL >> (BITS_PER_LONG - offset); if (size < BITS_PER_LONG) goto found_first; if (~tmp) goto found_middle; size -= BITS_PER_LONG; result += BITS_PER_LONG; } while (size & ~(BITS_PER_LONG-1)) { tmp = *(p++); if (~tmp) goto found_middle; result += BITS_PER_LONG; size -= BITS_PER_LONG; } if (!size) return result; tmp = *p; found_first: tmp |= ~0UL << size; if (tmp == ~0UL) /* Are any bits zero? */ return result + size; /* Nope. */ found_middle: return result + ffz(tmp); } static inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BITOP_WORD(offset); unsigned long result = offset & ~(BITS_PER_LONG-1); unsigned long tmp; if (offset >= size) return size; size -= result; offset %= BITS_PER_LONG; if (offset) { tmp = *(p++); tmp &= (~0UL << offset); if (size < BITS_PER_LONG) goto found_first; if (tmp) goto found_middle; size -= BITS_PER_LONG; result += BITS_PER_LONG; } while (size & ~(BITS_PER_LONG-1)) { tmp = *(p++); if (tmp) goto found_middle; result += BITS_PER_LONG; size -= BITS_PER_LONG; } if (!size) return result; tmp = *p; found_first: tmp &= (~0UL >> (BITS_PER_LONG - size)); if (tmp == 0UL) /* Are any bits set? */ return result + size; /* Nope. 
*/ found_middle: return result + __ffs(tmp); } static inline void set_bit(int nr, unsigned long *addr) { addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG); } static inline void atomic_set_bit(int nr, unsigned long *addr) { uatomic_or(addr + nr / BITS_PER_LONG, 1UL << (nr % BITS_PER_LONG)); } static inline int test_bit(unsigned int nr, const unsigned long *addr) { return ((1UL << (nr % BITS_PER_LONG)) & (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0; } static inline void clear_bit(unsigned int nr, unsigned long *addr) { addr[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG)); } /* * fls64 - find last set bit in a 64-bit word * @x: the word to search * * This is defined in a similar way as the libc and compiler builtin * ffsll, but returns the position of the most significant set bit. * * fls64(value) returns 0 if value is 0 or the position of the last * set bit if value is nonzero. The last (most significant) bit is * at position 64. */ #if __SIZEOF_LONG__ == 4 static __always_inline int fls64(uint64_t x) { uint32_t h = x >> 32; if (x == 0) return 0; if (h) return 64 - __builtin_clzl(h); return 32 - __builtin_clzl(x); } #elif __SIZEOF_LONG__ == 8 static __always_inline int fls64(uint64_t x) { if (x == 0) return 0; return 64 - __builtin_clzl(x); } #else #error __SIZEOF_LONG__ not 4 or 8 #endif #endif /* __BITOPS_H__ */ sheepdog-0.7.5/include/compiler.h000066400000000000000000000014741223630776600167510ustar00rootroot00000000000000/* * Copyright (C) 2009-2013 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef SD_COMPILER_H #define SD_COMPILER_H #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) #define __packed __attribute((packed)) #define __printf(a, b) __attribute__((format(printf, a, b))) /* Force a compilation error if the condition is true */ #define BUILD_BUG_ON(condition) ((void)sizeof(struct { int: -!!(condition); })) #endif /* SD_COMPILER_H */ sheepdog-0.7.5/include/event.h000066400000000000000000000014421223630776600162530ustar00rootroot00000000000000#ifndef __EVENT_H__ #define __EVENT_H__ #include "list.h" #include struct event_info; typedef void (*event_handler_t)(int fd, int events, void *data); int init_event(int nr); int register_event_prio(int fd, event_handler_t h, void *data, int prio); void unregister_event(int fd); int modify_event(int fd, unsigned int events); void event_loop(int timeout); void event_loop_prio(int timeout); void event_force_refresh(void); struct timer { void (*callback)(void *); void *data; }; void add_timer(struct timer *t, unsigned int mseconds); #define EVENT_PRIO_MAX INT_MAX #define EVENT_PRIO_DEFAULT 0 #define EVENT_PRIO_MIN INT_MIN static inline int register_event(int fd, event_handler_t h, void *data) { return register_event_prio(fd, h, data, EVENT_PRIO_DEFAULT); } #endif sheepdog-0.7.5/include/exits.h000066400000000000000000000010041223630776600162600ustar00rootroot00000000000000#ifndef __EXITS_H__ #define __EXITS_H__ #define EXIT_SUCCESS 0 /* command executed successfully */ #define EXIT_FAILURE 1 /* command failed to execute */ #define EXIT_SYSFAIL 2 /* something is wrong with the cluster or local host */ #define EXIT_EXISTS 3 /* the object already exists so cannot be created */ #define EXIT_FULL 4 /* no more space is left in the cluster */ #define EXIT_MISSING 5 /* the specified object does not exist */ #define EXIT_USAGE 64 /* invalid command, arguments or options */ #endif 
sheepdog-0.7.5/include/internal_proto.h000066400000000000000000000140041223630776600201670ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __INTERNAL_PROTO_H__ #define __INTERNAL_PROTO_H__ /* * This file specified the sheepdog-internal protocol, which is spoken between * sheepdog daemons, as well as between dog and sheepdog daemon for internal * operations. */ #include #include #include "sheepdog_proto.h" #define SD_SHEEP_PROTO_VER 0x08 #define SD_DEFAULT_COPIES 3 #define SD_MAX_COPIES 8 #define SD_MAX_NODES 1024 #define SD_DEFAULT_VNODES 64 #define SD_MAX_VNODES 65536 /* * Operations with opcodes above 0x80 are considered part of the inter-sheep * include sheep-dog protocol and are versioned using SD_SHEEP_PROTO_VER * instead of SD_PROTO_VER. * * These same applies for the above 0x80 flags and error values below. 
*/ #define SD_OP_GET_NODE_LIST 0x82 #define SD_OP_MAKE_FS 0x84 #define SD_OP_SHUTDOWN 0x85 #define SD_OP_STAT_SHEEP 0x86 #define SD_OP_STAT_CLUSTER 0x87 #define SD_OP_GET_VDI_ATTR 0x89 #define SD_OP_FORCE_RECOVER 0x8a #define SD_OP_GET_STORE_LIST 0x90 #define SD_OP_SNAPSHOT 0x91 #define SD_OP_RESTORE 0x92 #define SD_OP_GET_SNAP_FILE 0x93 #define SD_OP_CLEANUP 0x94 #define SD_OP_TRACE_STATUS 0x95 #define SD_OP_TRACE_READ_BUF 0x96 #define SD_OP_STAT_RECOVERY 0x97 #define SD_OP_FLUSH_DEL_CACHE 0x98 #define SD_OP_NOTIFY_VDI_DEL 0x99 #define SD_OP_KILL_NODE 0x9A #define SD_OP_TRACE_ENABLE 0x9B #define SD_OP_TRACE_DISABLE 0x9C #define SD_OP_GET_OBJ_LIST 0xA1 #define SD_OP_GET_EPOCH 0xA2 #define SD_OP_CREATE_AND_WRITE_PEER 0xA3 #define SD_OP_READ_PEER 0xA4 #define SD_OP_WRITE_PEER 0xA5 #define SD_OP_REMOVE_PEER 0xA6 /* #define SD_OP_SET_CACHE_SIZE 0xA7 deleted */ #define SD_OP_ENABLE_RECOVER 0xA8 #define SD_OP_DISABLE_RECOVER 0xA9 #define SD_OP_GET_VDI_COPIES 0xAB #define SD_OP_COMPLETE_RECOVERY 0xAC #define SD_OP_FLUSH_NODES 0xAD #define SD_OP_FLUSH_PEER 0xAE #define SD_OP_NOTIFY_VDI_ADD 0xAF #define SD_OP_DELETE_CACHE 0xB0 #define SD_OP_MD_INFO 0xB1 #define SD_OP_MD_PLUG 0xB2 #define SD_OP_MD_UNPLUG 0xB3 #define SD_OP_GET_HASH 0xB4 #define SD_OP_REWEIGHT 0xB5 #define SD_OP_GET_CACHE_INFO 0xB6 /* internal flags for hdr.flags, must be above 0x80 */ #define SD_FLAG_CMD_RECOVERY 0x0080 /* flags for VDI attribute operations */ #define SD_FLAG_CMD_CREAT 0x0100 #define SD_FLAG_CMD_EXCL 0x0200 #define SD_FLAG_CMD_DEL 0x0400 /* internal error return values, must be above 0x80 */ #define SD_RES_OLD_NODE_VER 0x81 /* Request has an old epoch */ #define SD_RES_NEW_NODE_VER 0x82 /* Request has a new epoch */ #define SD_RES_NOT_FORMATTED 0x83 /* Sheepdog is not formatted yet */ #define SD_RES_INVALID_CTIME 0x84 /* Creation time of sheepdog is different */ #define SD_RES_INVALID_EPOCH 0x85 /* Invalid epoch */ #define SD_RES_NETWORK_ERROR 0x86 /* Network error between sheep */ #define 
SD_RES_NO_CACHE 0x87 /* No cache object found */ #define SD_RES_BUFFER_SMALL 0x88 /* The buffer is too small */ #define SD_RES_FORCE_RECOVER 0x89 /* Users should not force recover this cluster */ #define SD_RES_NO_STORE 0x8A /* No targeted backend store */ #define SD_RES_NO_SUPPORT 0x8B /* Operation is not supported by backend store */ #define SD_RES_NODE_IN_RECOVERY 0x8C /* Targeted node is in recovery */ #define SD_RES_KILLED 0x8D /* Node is killed */ #define SD_RES_OID_EXIST 0x8E /* Object ID exists already */ #define SD_RES_AGAIN 0x8F /* Ask to try again */ #define SD_RES_STALE_OBJ 0x90 /* Object may be stale */ #define SD_RES_CLUSTER_ERROR 0x91 /* Cluster driver error */ enum sd_status { SD_STATUS_OK = 1, SD_STATUS_WAIT, SD_STATUS_SHUTDOWN, SD_STATUS_KILLED, }; struct node_id { uint8_t addr[16]; uint16_t port; uint8_t io_addr[16]; uint16_t io_port; uint8_t pad[4]; }; #define SD_NODE_SIZE 56 struct sd_node { struct node_id nid; uint16_t nr_vnodes; uint32_t zone; uint64_t space; }; /* * A joining sheep multicasts the local cluster info. Then, the existing nodes * reply the latest cluster info which is unique among all of the nodes. 
*/ struct cluster_info { uint8_t proto_ver; /* the version number of the internal protocol */ uint8_t disable_recovery; int16_t nr_nodes; uint32_t epoch; uint64_t ctime; uint16_t flags; uint8_t nr_copies; enum sd_status status : 8; uint32_t __pad; uint8_t store[STORE_LEN]; /* node list at cluster_info->epoch */ struct sd_node nodes[SD_MAX_NODES]; }; struct epoch_log { uint64_t ctime; uint64_t time; /* treated as time_t */ uint32_t epoch; uint32_t nr_nodes; uint8_t disable_recovery; uint8_t __pad[3]; struct sd_node nodes[SD_MAX_NODES]; }; struct vdi_op_message { struct sd_req req; struct sd_rsp rsp; uint8_t data[0]; }; struct md_info { int idx; uint64_t free; uint64_t used; char path[PATH_MAX]; }; #define MD_MAX_DISK 64 /* FIXME remove roof and make it dynamic */ struct sd_md_info { struct md_info disk[MD_MAX_DISK]; int nr; }; static inline __attribute__((used)) void __sd_epoch_format_build_bug_ons(void) { /* never called, only for checking BUILD_BUG_ON()s */ BUILD_BUG_ON(sizeof(struct sd_node) != SD_NODE_SIZE); } enum rw_state { RW_PREPARE_LIST, /* the recovery thread is preparing object list */ RW_RECOVER_OBJ, /* the thread is recoering objects */ RW_NOTIFY_COMPLETION, /* the thread is notifying recovery completion */ }; struct recovery_state { uint8_t in_recovery; enum rw_state state; uint64_t nr_finished; uint64_t nr_total; }; #define CACHE_MAX 1024 struct cache_info { uint32_t vid; uint32_t dirty; uint32_t total; }; struct object_cache_info { uint64_t size; uint64_t used; struct cache_info caches[CACHE_MAX]; int count; }; #endif /* __INTERNAL_PROTO_H__ */ sheepdog-0.7.5/include/list.h000066400000000000000000000143171223630776600161120ustar00rootroot00000000000000#ifndef __LIST_H__ #define __LIST_H__ /* taken from linux kernel */ #include #define container_of(ptr, type, member) ({ \ const typeof(((type *)0)->member) *__mptr = (ptr); \ (type *)((char *)__mptr - offsetof(type, member)); }) struct list_head { struct list_head *next, *prev; }; #define 
LIST_HEAD_INIT(name) { &(name), &(name) } #define LIST_HEAD(name) \ struct list_head name = LIST_HEAD_INIT(name) static inline void INIT_LIST_HEAD(struct list_head *list) { list->next = list; list->prev = list; } #define list_first_entry(ptr, type, member) \ list_entry((ptr)->next, type, member) static inline int list_empty(const struct list_head *head) { return head->next == head; } #define list_entry(ptr, type, member) \ container_of(ptr, type, member) #define list_for_each(pos, head) \ for (pos = (head)->next; pos != (head); pos = pos->next) #define list_for_each_entry(pos, head, member) \ for (pos = list_entry((head)->next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry(pos->member.next, typeof(*pos), member)) #define list_for_each_entry_safe(pos, n, head, member) \ for (pos = list_entry((head)->next, typeof(*pos), member), \ n = list_entry(pos->member.next, typeof(*pos), member); \ &pos->member != (head); \ pos = n, n = list_entry(n->member.next, typeof(*n), member)) static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { next->prev = new; new->next = next; new->prev = prev; prev->next = new; } static inline void list_add(struct list_head *new, struct list_head *head) { __list_add(new, head, head->next); } static inline void list_add_tail(struct list_head *new, struct list_head *head) { __list_add(new, head->prev, head); } static inline void __list_del(struct list_head *prev, struct list_head *next) { next->prev = prev; prev->next = next; } static inline void __list_del_entry(struct list_head *entry) { __list_del(entry->prev, entry->next); } static inline void list_del(struct list_head *entry) { __list_del(entry->prev, entry->next); entry->next = entry->prev = NULL; } static inline void list_del_init(struct list_head *entry) { __list_del_entry(entry); INIT_LIST_HEAD(entry); } static inline void list_move(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add(list, 
head); } static inline void list_move_tail(struct list_head *list, struct list_head *head) { __list_del_entry(list); list_add_tail(list, head); } static inline void __list_splice(const struct list_head *list, struct list_head *prev, struct list_head *next) { struct list_head *first = list->next; struct list_head *last = list->prev; first->prev = prev; prev->next = first; last->next = next; next->prev = last; } static inline void list_splice_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head, head->next); INIT_LIST_HEAD(list); } } static inline void list_splice_tail_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head->prev, head); INIT_LIST_HEAD(list); } } /* hlist, mostly useful for hash tables */ #define LIST_POISON1 ((void *) 0x00100100) #define LIST_POISON2 ((void *) 0x00200200) struct hlist_head { struct hlist_node *first; }; struct hlist_node { struct hlist_node *next, **pprev; }; #define HLIST_HEAD_INIT { .first = NULL } #define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } #define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) static inline void INIT_HLIST_NODE(struct hlist_node *h) { h->next = NULL; h->pprev = NULL; } static inline int hlist_unhashed(const struct hlist_node *h) { return !h->pprev; } static inline int hlist_empty(const struct hlist_head *h) { return !h->first; } static inline void __hlist_del(struct hlist_node *n) { struct hlist_node *next = n->next; struct hlist_node **pprev = n->pprev; *pprev = next; if (next) next->pprev = pprev; } static inline void hlist_del(struct hlist_node *n) { __hlist_del(n); n->next = LIST_POISON1; n->pprev = LIST_POISON2; } static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; n->next = first; if (first) first->pprev = &n->next; h->first = n; n->pprev = &h->first; } /* next must be != NULL */ static inline void hlist_add_before(struct 
hlist_node *n, struct hlist_node *next) { n->pprev = next->pprev; n->next = next; next->pprev = &n->next; *(n->pprev) = n; } static inline void hlist_add_after(struct hlist_node *n, struct hlist_node *next) { next->next = n->next; n->next = next; next->pprev = &n->next; if (next->next) next->next->pprev = &next->next; } #define hlist_entry(ptr, type, member) container_of(ptr, type, member) #define hlist_for_each(pos, head) \ for (pos = (head)->first; pos ; pos = pos->next) #define hlist_for_each_safe(pos, n, head) \ for (pos = (head)->first; pos && ({ n = pos->next; 1; }); pos = n) /* * hlist_for_each_entry - iterate over list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry(tpos, pos, head, member) \ for (pos = (head)->first; \ pos && ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ pos = pos->next) /* * hlist_for_each_entry_safe - iterate over list of given type safe against * removal of list entry * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @n: another &struct hlist_node to use as temporary storage * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_safe(tpos, pos, n, head, member) \ for (pos = (head)->first; \ pos && ({ n = pos->next; 1; }) && \ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ pos = n) void list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv, struct list_head *a, struct list_head *b)); #endif /* __LIST_H__ */ sheepdog-0.7.5/include/logger.h000066400000000000000000000052151223630776600164130ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . * * This code is based on log.h from Linux target framework (tgt). * Copyright (C) 2004 Dmitry Yusupov, Alex Aizman */ #ifndef LOGGER_H #define LOGGER_H #include #include #include "compiler.h" #define LOG_SPACE_SIZE (1 * 1024 * 1024) #define LOG_SPACE_DEBUG_SIZE (32 * 1024 * 1024) #define MAX_MSG_SIZE 1024 #define MAX_THREAD_NAME_LEN 20 struct logger_user_info { int port; }; extern int sd_log_level; void early_log_init(const char *format_name, struct logger_user_info *user_info); int log_init(const char *progname, bool to_stdout, int level, char *outfile); void log_close(void); void dump_logmsg(void *); void log_write(int prio, const char *func, int line, const char *fmt, ...) __printf(4, 5); void set_thread_name(const char *name, bool show_idx); void get_thread_name(char *name); #define sd_dump_variable(var) ({ \ __sd_dump_variable(#var); \ }) int __sd_dump_variable(const char *var); void sd_backtrace(void); /* sheep log priorities, comliant with syslog spec */ #define SDOG_EMERG LOG_EMERG #define SDOG_ALERT LOG_ALERT #define SDOG_CRIT LOG_CRIT #define SDOG_ERR LOG_ERR #define SDOG_WARNING LOG_WARNING #define SDOG_NOTICE LOG_NOTICE #define SDOG_INFO LOG_INFO #define SDOG_DEBUG LOG_DEBUG #define sd_emerg(fmt, args...) \ log_write(SDOG_EMERG, __func__, __LINE__, fmt, ##args) #define sd_alert(fmt, args...) \ log_write(SDOG_ALERT, __func__, __LINE__, fmt, ##args) #define sd_crit(fmt, args...) \ log_write(SDOG_CRIT, __func__, __LINE__, fmt, ##args) #define sd_err(fmt, args...) \ log_write(SDOG_ERR, __func__, __LINE__, fmt, ##args) #define sd_warn(fmt, args...) \ log_write(SDOG_WARNING, __func__, __LINE__, fmt, ##args) #define sd_notice(fmt, args...) 
\ log_write(SDOG_NOTICE, __func__, __LINE__, fmt, ##args) #define sd_info(fmt, args...) \ log_write(SDOG_INFO, __func__, __LINE__, fmt, ##args) /* * 'args' must not contain an operation/function with a side-effect. It won't * be evaluated when the log level is not SDOG_DEBUG. */ #define sd_debug(fmt, args...) \ ({ \ if (unlikely(sd_log_level == SDOG_DEBUG)) \ log_write(SDOG_DEBUG, __func__, __LINE__, fmt, ##args); \ }) #define panic(fmt, args...) \ ({ \ sd_emerg("PANIC: " fmt, ##args); \ abort(); \ }) #endif /* LOG_H */ sheepdog-0.7.5/include/net.h000066400000000000000000000050411223630776600157170ustar00rootroot00000000000000#ifndef __NET_H__ #define __NET_H__ #include #include #include "sheepdog_proto.h" /* * We can't always retry because if only IO NIC is down, we'll retry for ever. * * We observed that for a busy node, the response could be as long as 15s, so * wait 30s would be a safe value. Even we are false timeouted, the gateway will * retry the request and sockfd cache module will repair the false-closes. 
*/ #define MAX_POLLTIME 30 /* seconds */ #define POLL_TIMEOUT 5 /* seconds */ #define MAX_RETRY_COUNT (MAX_POLLTIME / POLL_TIMEOUT) enum conn_state { C_IO_HEADER = 0, C_IO_DATA_INIT, C_IO_DATA, C_IO_END, C_IO_CLOSED, }; struct connection { int fd; unsigned int events; uint16_t port; char ipstr[INET6_ADDRSTRLEN]; enum conn_state c_rx_state; int rx_length; void *rx_buf; struct sd_req rx_hdr; enum conn_state c_tx_state; int tx_length; void *tx_buf; struct sd_rsp tx_hdr; }; int conn_tx_off(struct connection *conn); int conn_tx_on(struct connection *conn); int conn_rx_off(struct connection *conn); int conn_rx_on(struct connection *conn); bool is_conn_dead(const struct connection *conn); int do_read(int sockfd, void *buf, int len, bool (*need_retry)(uint32_t), uint32_t, uint32_t); int rx(struct connection *conn, enum conn_state next_state); int tx(struct connection *conn, enum conn_state next_state); int connect_to(const char *name, int port); int send_req(int sockfd, struct sd_req *hdr, void *data, unsigned int wlen, bool (*need_retry)(uint32_t), uint32_t, uint32_t); int exec_req(int sockfd, struct sd_req *hdr, void *, bool (*need_retry)(uint32_t), uint32_t, uint32_t); int create_listen_ports(const char *bindaddr, int port, int (*callback)(int fd, void *), void *data); int create_unix_domain_socket(const char *unix_path, int (*callback)(int, void *), void *data); const char *addr_to_str(const uint8_t *addr, uint16_t port); uint8_t *str_to_addr(const char *ipstr, uint8_t *addr); char *sockaddr_in_to_str(struct sockaddr_in *sockaddr); int set_nonblocking(int fd); int set_nodelay(int fd); int set_keepalive(int fd); int set_snd_timeout(int fd); int set_rcv_timeout(int fd); int get_local_addr(uint8_t *bytes); bool inetaddr_is_valid(char *addr); int do_writev2(int fd, void *hdr, size_t hdr_len, void *body, size_t body_len); /* for typical usage of do_writev2() */ #define writev2(fd, hdr, body, body_len) \ do_writev2(fd, hdr, sizeof(*hdr), body, body_len) static inline int 
connect_to_addr(const uint8_t *addr, int port) { return connect_to(addr_to_str(addr, 0), port); } #endif sheepdog-0.7.5/include/option.h000066400000000000000000000015761223630776600164520ustar00rootroot00000000000000/* * Copyright (C) 2012 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __SD_OPTION_H__ #define __SD_OPTION_H__ #include #include struct sd_option { int ch; const char *name; bool has_arg; const char *desc; const char *help; }; char *build_short_options(const struct sd_option *opts); struct option *build_long_options(const struct sd_option *opts); const char *option_get_help(const struct sd_option *, int); #define sd_for_each_option(opt, opts) \ for (opt = (opts); opt->name; opt++) #endif /* __SD_OPTION_H__ */ sheepdog-0.7.5/include/rbtree.h000066400000000000000000000045711223630776600164230ustar00rootroot00000000000000#ifndef __RBTREE_H_ #define __RBTREE_H_ struct rb_node { unsigned long rb_parent_color; #define RB_RED 0 #define RB_BLACK 1 struct rb_node *rb_right; struct rb_node *rb_left; }; struct rb_root { struct rb_node *rb_node; }; #define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~3)) #define rb_color(r) ((r)->rb_parent_color & 1) #define rb_is_red(r) (!rb_color(r)) #define rb_is_black(r) rb_color(r) #define rb_set_red(r) do { (r)->rb_parent_color &= ~1; } while (0) #define rb_set_black(r) do { (r)->rb_parent_color |= 1; } while (0) static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p; } static inline void rb_set_color(struct rb_node *rb, int color) { rb->rb_parent_color = (rb->rb_parent_color & ~1) | color; } #define RB_ROOT { .rb_node = NULL } 
static inline void INIT_RB_ROOT(struct rb_root *root) { root->rb_node = NULL; } #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) #define RB_EMPTY_NODE(node) (rb_parent(node) == node) #define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) static inline void rb_init_node(struct rb_node *rb) { rb->rb_parent_color = 0; rb->rb_right = NULL; rb->rb_left = NULL; RB_CLEAR_NODE(rb); } void rb_insert_color(struct rb_node *, struct rb_root *); void rb_erase(struct rb_node *, struct rb_root *); typedef void (*rb_augment_f)(struct rb_node *node, void *data); void rb_augment_insert(struct rb_node *node, rb_augment_f func, void *data); struct rb_node *rb_augment_erase_begin(struct rb_node *node); void rb_augment_erase_end(struct rb_node *node, rb_augment_f func, void *data); /* Find logical next and previous nodes in a tree */ struct rb_node *rb_next(const struct rb_node *); struct rb_node *rb_prev(const struct rb_node *); struct rb_node *rb_first(const struct rb_root *); struct rb_node *rb_last(const struct rb_root *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; *rb_link = node; } #endif /* __RBTREE_H_ */ sheepdog-0.7.5/include/sha1.h000066400000000000000000000015061223630776600157670ustar00rootroot00000000000000/* * sha1.h - SHA1 Secure Hash Algorithm used for CHAP authentication. * copied from the Linux kernel's Cryptographic API and slightly adjusted to * fit IET's needs * * This file is (c) 2004 Xiranet Communications GmbH * and licensed under the GPL. 
*/ #ifndef SHA1_H #define SHA1_H #include #include #include #define SHA1_DIGEST_SIZE 20 #define SHA1_BLOCK_SIZE 64 struct sha1_ctx { uint64_t count; uint32_t state[SHA1_DIGEST_SIZE / 4]; uint8_t buffer[SHA1_BLOCK_SIZE]; }; void sha1_init(void *ctx); void sha1_update(void *ctx, const uint8_t *data, unsigned int len); void sha1_final(void *ctx, uint8_t *out); const char *sha1_to_hex(const unsigned char *sha1); void sha1_from_buffer(const void *buf, size_t size, unsigned char *sha1); #endif sheepdog-0.7.5/include/sheep.h000066400000000000000000000220671223630776600162440ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef __SHEEP_H__ #define __SHEEP_H__ #include #include "internal_proto.h" #include "util.h" #include "bitops.h" #include "list.h" #include "net.h" struct sd_vnode { struct node_id nid; uint16_t node_idx; uint32_t zone; uint64_t id; }; struct vnode_info { struct sd_vnode vnodes[SD_MAX_VNODES]; int nr_vnodes; struct sd_node nodes[SD_MAX_NODES]; int nr_nodes; int nr_zones; refcnt_t refcnt; }; #define TRACE_GRAPH_ENTRY 0x01 #define TRACE_GRAPH_RETURN 0x02 #define TRACE_FNAME_LEN 36 #define TRACE_THREAD_LEN MAX_THREAD_NAME_LEN struct trace_graph_item { char tname[TRACE_THREAD_LEN]; int type; char fname[TRACE_FNAME_LEN]; int depth; uint64_t entry_time; uint64_t return_time; }; static inline void sd_init_req(struct sd_req *req, uint8_t opcode) { memset(req, 0, sizeof(*req)); req->opcode = opcode; req->proto_ver = opcode < 0x80 ? 
SD_PROTO_VER : SD_SHEEP_PROTO_VER; } static inline int same_zone(const struct sd_vnode *e, int n1, int n2) { return e[n1].zone == e[n2].zone; } /* Get the first vnode's index which is matching the OID */ static inline int get_vnode_first_idx(const struct sd_vnode *entries, int nr_entries, uint64_t oid) { uint64_t id = fnv_64a_buf(&oid, sizeof(oid), FNV1A_64_INIT); int start, end, pos; assert(nr_entries > 0); start = 0; end = nr_entries - 1; if (id > entries[end].id || id < entries[start].id) return (end + 1) % nr_entries; for (;;) { pos = (end - start) / 2 + start; if (entries[pos].id < id) { if (entries[pos + 1].id >= id) return (pos + 1) % nr_entries; start = pos; } else end = pos; } } /* Get next vnode's index according to the PREV_IDXS */ static inline int get_vnode_next_idx(const struct sd_vnode *entries, int nr_entries, int *prev_idxs, int nr_prev_idxs) { int i, idx, first_idx; bool found; first_idx = prev_idxs[0]; idx = prev_idxs[nr_prev_idxs - 1]; for (;;) { idx = (idx + 1) % nr_entries; if (unlikely(idx == first_idx)) panic("can't find next new idx"); for (found = false, i = 0; i < nr_prev_idxs; i++) { if (same_zone(entries, idx, prev_idxs[i])) { found = true; break; } } if (!found) return idx; } } /* Get the n'th vnode's index which is matching the OID */ static inline int get_vnode_nth_idx(const struct sd_vnode *entries, int nr_entries, uint64_t oid, int nth) { int nr_idxs = 0, idxs[SD_MAX_COPIES]; idxs[nr_idxs++] = get_vnode_first_idx(entries, nr_entries, oid); if (!nth) return idxs[nth]; while (nr_idxs <= nth) { idxs[nr_idxs] = get_vnode_next_idx(entries, nr_entries, idxs, nr_idxs); nr_idxs++; } return idxs[nth]; } static inline const struct sd_vnode *oid_to_vnode(const struct sd_vnode *entries, int nr_entries, uint64_t oid, int copy_idx) { int idx = get_vnode_nth_idx(entries, nr_entries, oid, copy_idx); return &entries[idx]; } static inline const struct sd_node *oid_to_node(const struct sd_vnode *entries, int nr_entries, uint64_t oid, int copy_idx, 
const struct sd_node *all_nodes) { const struct sd_vnode *vnode; vnode = oid_to_vnode(entries, nr_entries, oid, copy_idx); return &all_nodes[vnode->node_idx]; } static inline void oid_to_vnodes(const struct sd_vnode *entries, int nr_entries, uint64_t oid, int nr_copies, const struct sd_vnode **vnodes) { int idx, idxs[SD_MAX_COPIES], i; if (nr_entries == 0) return; idx = get_vnode_first_idx(entries, nr_entries, oid); idxs[0] = idx; vnodes[0] = &entries[idx]; for (i = 1; i < nr_copies; i++) { idx = get_vnode_next_idx(entries, nr_entries, idxs, i); idxs[i] = idx; vnodes[i] = &entries[idx]; } } static inline void oid_to_nodes(const struct sd_vnode *entries, int nr_entries, uint64_t oid, int nr_copies, const struct sd_node *all_nodes, const struct sd_node **nodes) { int i; const struct sd_vnode *vnodes[SD_MAX_COPIES]; oid_to_vnodes(entries, nr_entries, oid, nr_copies, vnodes); for (i = 0; i < nr_copies; i++) nodes[i] = &all_nodes[vnodes[i]->node_idx]; } static inline const char *sd_strerror(int err) { static const char *descs[256] = { /* from sheepdog_proto.h */ [SD_RES_SUCCESS] = "Success", [SD_RES_UNKNOWN] = "Unknown error", [SD_RES_NO_OBJ] = "No object found", [SD_RES_EIO] = "I/O error", [SD_RES_VDI_EXIST] = "VDI exists already", [SD_RES_INVALID_PARMS] = "Invalid parameters", [SD_RES_SYSTEM_ERROR] = "System error", [SD_RES_VDI_LOCKED] = "VDI is already locked", [SD_RES_NO_VDI] = "No VDI found", [SD_RES_NO_BASE_VDI] = "No base VDI found", [SD_RES_VDI_READ] = "Failed to read from requested VDI", [SD_RES_VDI_WRITE] = "Failed to write to requested VDI", [SD_RES_BASE_VDI_READ] = "Failed to read from base VDI", [SD_RES_BASE_VDI_WRITE] = "Failed to write to base VDI", [SD_RES_NO_TAG] = "Failed to find requested tag", [SD_RES_STARTUP] = "System is still booting", [SD_RES_VDI_NOT_LOCKED] = "VDI is not locked", [SD_RES_SHUTDOWN] = "System is shutting down", [SD_RES_NO_MEM] = "Out of memory on server", [SD_RES_FULL_VDI] = "Maximum number of VDIs reached", [SD_RES_VER_MISMATCH] 
= "Protocol version mismatch", [SD_RES_NO_SPACE] = "Server has no space for new objects", [SD_RES_WAIT_FOR_FORMAT] = "Waiting for cluster to be formatted", [SD_RES_WAIT_FOR_JOIN] = "Waiting for other nodes to join cluster", [SD_RES_JOIN_FAILED] = "Node has failed to join cluster", [SD_RES_HALT] = "IO has halted as there are no living nodes", [SD_RES_READONLY] = "Object is read-only", /* from internal_proto.h */ [SD_RES_OLD_NODE_VER] = "Request has an old epoch", [SD_RES_NEW_NODE_VER] = "Request has a new epoch", [SD_RES_NOT_FORMATTED] = "Cluster has not been formatted", [SD_RES_INVALID_CTIME] = "Creation times differ", [SD_RES_INVALID_EPOCH] = "Invalid epoch", [SD_RES_NETWORK_ERROR] = "Network error between sheep", [SD_RES_NO_CACHE] = "No cache object found", [SD_RES_BUFFER_SMALL] = "The buffer is too small", [SD_RES_FORCE_RECOVER] = "Cluster is running/halted and cannot be force recovered", [SD_RES_NO_STORE] = "Targeted backend store is not found", [SD_RES_NO_SUPPORT] = "Operation is not supported", [SD_RES_NODE_IN_RECOVERY] = "Targeted node is in recovery", [SD_RES_KILLED] = "Node is killed", [SD_RES_OID_EXIST] = "Object ID exists already", [SD_RES_AGAIN] = "Ask to try again", [SD_RES_STALE_OBJ] = "Object may be stale", [SD_RES_CLUSTER_ERROR] = "Cluster driver error", }; if (descs[err] == NULL) { static __thread char msg[32]; snprintf(msg, sizeof(msg), "Invalid error code %x", err); return msg; } return descs[err]; } static inline int node_id_cmp(const struct node_id *node1, const struct node_id *node2) { int cmp = memcmp(node1->addr, node2->addr, sizeof(node1->addr)); if (cmp != 0) return cmp; return intcmp(node1->port, node2->port); } static inline int node_cmp(const struct sd_node *node1, const struct sd_node *node2) { return node_id_cmp(&node1->nid, &node2->nid); } static inline bool node_eq(const struct sd_node *a, const struct sd_node *b) { return node_cmp(a, b) == 0; } static inline int vnode_cmp(const struct sd_vnode *node1, const struct sd_vnode *node2) 
{ return intcmp(node1->id, node2->id); } static inline int nodes_to_vnodes(struct sd_node *nodes, int nr, struct sd_vnode *vnodes) { struct sd_node *n = nodes; int i, j, nr_vnodes = 0; uint64_t hval; while (nr--) { hval = FNV1A_64_INIT; for (i = 0; i < n->nr_vnodes; i++) { if (vnodes) { hval = fnv_64a_buf(&n->nid.port, sizeof(n->nid.port), hval); for (j = ARRAY_SIZE(n->nid.addr) - 1; j >= 0; j--) hval = fnv_64a_buf(&n->nid.addr[j], 1, hval); vnodes[nr_vnodes].id = hval; memcpy(vnodes[nr_vnodes].nid.addr, n->nid.addr, sizeof(n->nid.addr)); vnodes[nr_vnodes].nid.port = n->nid.port; vnodes[nr_vnodes].node_idx = n - nodes; vnodes[nr_vnodes].zone = n->zone; } nr_vnodes++; } n++; } if (vnodes) xqsort(vnodes, nr_vnodes, vnode_cmp); return nr_vnodes; } #define MAX_NODE_STR_LEN 256 static inline const char *node_to_str(const struct sd_node *id) { static __thread char str[MAX_NODE_STR_LEN]; int af = AF_INET6; const uint8_t *addr = id->nid.addr; /* Find address family type */ if (addr[12]) { int oct_no = 0; while (!addr[oct_no] && oct_no++ < 12) ; if (oct_no == 12) af = AF_INET; } snprintf(str, sizeof(str), "%s ip:%s port:%d", (af == AF_INET) ? "IPv4" : "IPv6", addr_to_str(id->nid.addr, 0), id->nid.port); return str; } static inline struct sd_node *str_to_node(const char *str, struct sd_node *id) { int port; char v[8], ip[MAX_NODE_STR_LEN]; sscanf(str, "%s ip:%s port:%d", v, ip, &port); id->nid.port = port; if (!str_to_addr(ip, id->nid.addr)) return NULL; return id; } #endif sheepdog-0.7.5/include/sheepdog_proto.h000066400000000000000000000207301223630776600201540ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef __SHEEPDOG_PROTO_H__ #define __SHEEPDOG_PROTO_H__ #include #include #include #include #include "compiler.h" #define SD_PROTO_VER 0x02 /* This or later version supports trimming zero sectors from read response */ #define SD_PROTO_VER_TRIM_ZERO_SECTORS 0x02 #define SD_LISTEN_PORT 7000 #define SD_OP_CREATE_AND_WRITE_OBJ 0x01 #define SD_OP_READ_OBJ 0x02 #define SD_OP_WRITE_OBJ 0x03 #define SD_OP_REMOVE_OBJ 0x04 #define SD_OP_DISCARD_OBJ 0x05 #define SD_OP_NEW_VDI 0x11 #define SD_OP_LOCK_VDI 0x12 #define SD_OP_RELEASE_VDI 0x13 #define SD_OP_GET_VDI_INFO 0x14 #define SD_OP_READ_VDIS 0x15 #define SD_OP_FLUSH_VDI 0x16 #define SD_OP_DEL_VDI 0x17 #define SD_FLAG_CMD_WRITE 0x01 #define SD_FLAG_CMD_COW 0x02 #define SD_FLAG_CMD_CACHE 0x04 #define SD_FLAG_CMD_DIRECT 0x08 /* don't use object cache */ /* flags above 0x80 are sheepdog-internal */ #define SD_RES_SUCCESS 0x00 /* Success */ #define SD_RES_UNKNOWN 0x01 /* Unknown error */ #define SD_RES_NO_OBJ 0x02 /* No object found */ #define SD_RES_EIO 0x03 /* I/O error */ #define SD_RES_VDI_EXIST 0x04 /* VDI exists already */ #define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */ #define SD_RES_SYSTEM_ERROR 0x06 /* System error */ #define SD_RES_VDI_LOCKED 0x07 /* VDI is locked */ #define SD_RES_NO_VDI 0x08 /* No VDI found */ #define SD_RES_NO_BASE_VDI 0x09 /* No base VDI found */ #define SD_RES_VDI_READ 0x0A /* Cannot read requested VDI */ #define SD_RES_VDI_WRITE 0x0B /* Cannot write requested VDI */ #define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base VDI */ #define SD_RES_BASE_VDI_WRITE 0x0D /* Cannot write base VDI */ #define SD_RES_NO_TAG 0x0E /* Requested tag is not found */ #define SD_RES_STARTUP 0x0F /* Sheepdog is on starting up */ #define SD_RES_VDI_NOT_LOCKED 0x10 /* VDI is not locked */ #define SD_RES_SHUTDOWN 0x11 /* Sheepdog is shutting down */ #define SD_RES_NO_MEM 0x12 /* Cannot allocate memory */ #define SD_RES_FULL_VDI 0x13 /* we already have the maximum VDIs */ #define SD_RES_VER_MISMATCH 0x14 /* 
Protocol version mismatch */ #define SD_RES_NO_SPACE 0x15 /* Server has no room for new objects */ #define SD_RES_WAIT_FOR_FORMAT 0x16 /* Sheepdog is waiting for a format operation */ #define SD_RES_WAIT_FOR_JOIN 0x17 /* Sheepdog is waiting for other nodes joining */ #define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog */ #define SD_RES_HALT 0x19 /* Sheepdog is stopped doing IO */ #define SD_RES_READONLY 0x1A /* Object is read-only */ /* errors above 0x80 are sheepdog-internal */ /* * Object ID rules * * 0 - 19 (20 bits): data object space * 20 - 31 (12 bits): reserved data object space * 32 - 55 (24 bits): VDI object space * 56 - 59 ( 4 bits): reserved VDI object space * 60 - 63 ( 4 bits): object type indentifier space */ #define VDI_SPACE_SHIFT 32 #define SD_VDI_MASK 0x00FFFFFF00000000 #define VDI_BIT (UINT64_C(1) << 63) #define VMSTATE_BIT (UINT64_C(1) << 62) #define VDI_ATTR_BIT (UINT64_C(1) << 61) #define MAX_DATA_OBJS (1ULL << 20) #define MAX_CHILDREN 1024U #define SD_MAX_VDI_LEN 256U #define SD_MAX_VDI_TAG_LEN 256U #define SD_MAX_VDI_ATTR_KEY_LEN 256U #define SD_MAX_VDI_ATTR_VALUE_LEN 65536U #define SD_MAX_SNAPSHOT_TAG_LEN 256U #define SD_NR_VDIS (1U << 24) #define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22) #define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS) #define SD_INODE_SIZE (sizeof(struct sd_inode)) #define SD_INODE_HEADER_SIZE (sizeof(struct sd_inode) - \ sizeof(uint32_t) * MAX_DATA_OBJS) #define SD_ATTR_OBJ_SIZE (sizeof(struct sheepdog_vdi_attr)) #define CURRENT_VDI_ID 0 #define STORE_LEN 16 #define SD_REQ_SIZE 48 #define SD_RSP_SIZE 48 struct sd_req { uint8_t proto_ver; uint8_t opcode; uint16_t flags; uint32_t epoch; uint32_t id; uint32_t data_length; union { struct { uint64_t oid; uint64_t cow_oid; uint32_t copies; uint32_t tgt_epoch; uint64_t offset; } obj; struct { uint64_t vdi_size; uint32_t base_vdi_id; uint32_t copies; uint32_t snapid; } vdi; /* sheepdog-internal */ struct { uint64_t oid; uint64_t ctime; uint32_t copies; 
uint32_t tag; } cluster; struct { uint32_t old_vid; uint32_t new_vid; uint32_t copies; uint8_t set_bitmap; /* 0 means false */ /* others mean true */ } vdi_state; uint32_t __pad[8]; }; }; struct sd_rsp { uint8_t proto_ver; uint8_t opcode; uint16_t flags; uint32_t epoch; uint32_t id; uint32_t data_length; union { uint32_t result; struct { uint32_t __pad; uint32_t copies; uint64_t offset; } obj; struct { uint32_t __pad; uint32_t rsvd; uint32_t vdi_id; uint32_t attr_id; uint32_t copies; } vdi; /* sheepdog-internal */ struct { uint32_t __pad; uint32_t nr_nodes; uint32_t local_idx; uint32_t __reserved; uint64_t store_size; uint64_t store_free; } node; struct { uint32_t __pad1; uint32_t __pad2; uint8_t digest[20]; } hash; uint32_t __pad[8]; }; }; struct sd_inode { char name[SD_MAX_VDI_LEN]; char tag[SD_MAX_VDI_TAG_LEN]; uint64_t create_time; uint64_t snap_ctime; uint64_t vm_clock_nsec; uint64_t vdi_size; uint64_t vm_state_size; uint16_t copy_policy; uint8_t nr_copies; uint8_t block_size_shift; uint32_t snap_id; uint32_t vdi_id; uint32_t parent_vdi_id; uint32_t child_vdi_id[MAX_CHILDREN]; uint32_t data_vdi_id[MAX_DATA_OBJS]; }; struct sheepdog_vdi_attr { char name[SD_MAX_VDI_LEN]; char tag[SD_MAX_VDI_TAG_LEN]; uint64_t ctime; uint32_t snap_id; uint32_t value_len; char key[SD_MAX_VDI_ATTR_KEY_LEN]; char value[SD_MAX_VDI_ATTR_VALUE_LEN]; }; /* 64 bit FNV-1a non-zero initial basis */ #define FNV1A_64_INIT ((uint64_t) 0xcbf29ce484222325ULL) /* 64 bit Fowler/Noll/Vo FNV-1a hash code */ static inline uint64_t fnv_64a_buf(const void *buf, size_t len, uint64_t hval) { unsigned char *bp = (unsigned char *) buf; unsigned char *be = bp + len; while (bp < be) { hval ^= (uint64_t) *bp++; hval += (hval << 1) + (hval << 4) + (hval << 5) + (hval << 7) + (hval << 8) + (hval << 40); } return hval; } static inline uint64_t hash_64(uint64_t val, unsigned int bits) { uint64_t hash = fnv_64a_buf(&val, sizeof(uint64_t), FNV1A_64_INIT); return hash & ((1 << bits) - 1); } static inline bool 
is_data_obj_writeable(const struct sd_inode *inode, int idx) { return inode->vdi_id == inode->data_vdi_id[idx]; } static inline bool is_vdi_obj(uint64_t oid) { return !!(oid & VDI_BIT); } static inline bool is_vmstate_obj(uint64_t oid) { return !!(oid & VMSTATE_BIT); } static inline bool is_vdi_attr_obj(uint64_t oid) { return !!(oid & VDI_ATTR_BIT); } static inline bool is_data_obj(uint64_t oid) { return !is_vdi_obj(oid) && !is_vmstate_obj(oid) && !is_vdi_attr_obj(oid); } static inline size_t get_objsize(uint64_t oid) { if (is_vdi_obj(oid)) return SD_INODE_SIZE; if (is_vdi_attr_obj(oid)) return SD_ATTR_OBJ_SIZE; return SD_DATA_OBJ_SIZE; } static inline uint64_t data_oid_to_idx(uint64_t oid) { return oid & (MAX_DATA_OBJS - 1); } static inline uint64_t vid_to_vdi_oid(uint32_t vid) { return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT); } static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx) { return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx; } static inline uint32_t oid_to_vid(uint64_t oid) { return (oid & SD_VDI_MASK) >> VDI_SPACE_SHIFT; } static inline uint64_t vid_to_attr_oid(uint32_t vid, uint32_t attrid) { return ((uint64_t)vid << VDI_SPACE_SHIFT) | VDI_ATTR_BIT | attrid; } static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx) { return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx; } static inline bool vdi_is_snapshot(const struct sd_inode *inode) { return !!inode->snap_ctime; } static inline __attribute__((used)) void __sd_proto_build_bug_ons(void) { /* never called, only for checking BUILD_BUG_ON()s */ BUILD_BUG_ON(sizeof(struct sd_req) != SD_REQ_SIZE); BUILD_BUG_ON(sizeof(struct sd_rsp) != SD_RSP_SIZE); } #endif sheepdog-0.7.5/include/shepherd.h000066400000000000000000000053501223630776600167360ustar00rootroot00000000000000#ifndef SHEPHERD_H #define SHEPHERD_H enum sph_cli_msg_type { /* messages sent by a cluster driver, received by shepherd */ SPH_CLI_MSG_JOIN = 0, SPH_CLI_MSG_ACCEPT, SPH_CLI_MSG_NOTIFY, SPH_CLI_MSG_BLOCK, 
SPH_CLI_MSG_LEAVE, }; enum sph_srv_msg_type { /* messages sent by shepherd, received by a cluster driver */ SPH_SRV_MSG_JOIN_REPLY = 0, SPH_SRV_MSG_JOIN_RETRY, SPH_SRV_MSG_NEW_NODE, SPH_SRV_MSG_NEW_NODE_FINISH, SPH_SRV_MSG_NOTIFY_FORWARD, SPH_SRV_MSG_BLOCK_FORWARD, SPH_SRV_MSG_LEAVE_FORWARD, SPH_SRV_MSG_REMOVE, }; struct sph_msg { /* * original type of uint32_t type: * enum sph_cli_msg_type or enum sph_srv_msg_type */ uint32_t type; uint32_t body_len; }; #include "internal_proto.h" struct sph_msg_join { struct sd_node new_node; struct sd_node nodes[SD_MAX_NODES]; uint32_t nr_nodes; uint8_t opaque[0]; }; struct sph_msg_join_reply { struct sd_node nodes[SD_MAX_NODES]; uint32_t nr_nodes; uint8_t opaque[0]; }; struct sph_msg_join_node_finish { struct sd_node new_node; struct sd_node nodes[SD_MAX_NODES]; uint32_t nr_nodes; uint8_t opaque[0]; }; struct sph_msg_notify { uint8_t unblock; uint8_t notify_msg[0]; }; struct sph_msg_notify_forward { struct sd_node from_node; uint8_t unblock; uint8_t notify_msg[0]; }; #define SHEPHERD_PORT 2501 static inline const char *sph_cli_msg_to_str(enum sph_cli_msg_type msg) /* CAUTION: non reentrant */ { int i; static char unknown[64]; static const struct { enum sph_cli_msg_type msg; const char *desc; } msgs[] = { { SPH_CLI_MSG_JOIN, "SPH_CLI_MSG_JOIN" }, { SPH_CLI_MSG_ACCEPT, "SPH_CLI_MSG_ACCEPT" }, { SPH_CLI_MSG_NOTIFY, "SPH_CLI_MSG_NOTIFY" }, { SPH_CLI_MSG_BLOCK, "SPH_CLI_MSG_BLOCK" }, { SPH_CLI_MSG_LEAVE, "SPH_CLI_MSG_LEAVE" }, }; for (i = 0; i < ARRAY_SIZE(msgs); i++) { if (msgs[i].msg == msg) return msgs[i].desc; } memset(unknown, 0, 64); snprintf(unknown, 64, "", msg); return unknown; } static inline const char *sph_srv_msg_to_str(enum sph_srv_msg_type msg) /* CAUTION: non reentrant */ { int i; static char unknown[64]; static const struct { enum sph_srv_msg_type msg; const char *desc; } msgs[] = { { SPH_SRV_MSG_JOIN_RETRY, "SPH_SRV_MSG_JOIN_RETRY" }, { SPH_SRV_MSG_NEW_NODE, "SPH_SRV_MSG_NEW_NODE" }, { SPH_SRV_MSG_NEW_NODE_FINISH, 
"SPH_SRV_MSG_NEW_NODE_FINISH" }, { SPH_SRV_MSG_NOTIFY_FORWARD, "SPH_SRV_MSG_NOTIFY_FORWARD" }, { SPH_SRV_MSG_BLOCK_FORWARD, "SPH_SRV_MSG_BLOCK_FORWARD" }, { SPH_SRV_MSG_REMOVE, "SPH_SRV_MSG_REMOVE" }, }; for (i = 0; i < ARRAY_SIZE(msgs); i++) { if (msgs[i].msg == msg) return msgs[i].desc; } memset(unknown, 0, 64); snprintf(unknown, 64, "", msg); return unknown; } #endif /* SHEPHERD_H */ sheepdog-0.7.5/include/sockfd_cache.h000066400000000000000000000011001223630776600175150ustar00rootroot00000000000000#ifndef SOCKFD_CACHE_H #define SOCKFD_CACHE_H #include "internal_proto.h" #include "work.h" struct sockfd *sockfd_cache_get(const struct node_id *nid); void sockfd_cache_put(const struct node_id *nid, struct sockfd *sfd); void sockfd_cache_del_node(const struct node_id *nid); void sockfd_cache_del(const struct node_id *nid, struct sockfd *sfd); void sockfd_cache_add(const struct node_id *nid); void sockfd_cache_add_group(const struct sd_node *nodes, int nr); int sockfd_init(void); /* sockfd_cache */ struct sockfd { int fd; int idx; }; #endif /* SOCKFD_CACHE_H */ sheepdog-0.7.5/include/strbuf.h000066400000000000000000000051201223630776600164340ustar00rootroot00000000000000#ifndef STRBUF_H #define STRBUF_H #include #include #include #include #include #include "util.h" struct strbuf { size_t alloc; size_t len; int eof; char *buf; }; #define alloc_nr(x) (((x)+16)*3/2) /* * Realloc the buffer pointed at by variable 'x' so that it can hold * at least 'nr' entries; the number of entries currently allocated * is 'alloc', using the standard growing factor alloc_nr() macro. * * DO NOT USE any expression with side-effect for 'x' or 'alloc'. 
*/ #define ALLOC_GROW(x, nr, alloc) \ do { \ if ((nr) > alloc) { \ if (alloc_nr(alloc) < (nr)) \ alloc = (nr); \ else \ alloc = alloc_nr(alloc); \ x = xrealloc((x), alloc * sizeof(*(x))); \ } \ } while (0) #define STRBUF_INIT { 0, 0, 0, NULL } /*----- strbuf life cycle -----*/ void strbuf_init(struct strbuf *, size_t); void strbuf_release(struct strbuf *); void strbuf_reset(struct strbuf *); char *strbuf_detach(struct strbuf *); void strbuf_attach(struct strbuf *, void *, size_t, size_t); /*----- strbuf size related -----*/ static inline size_t strbuf_avail(struct strbuf *sb) { return sb->alloc ? sb->alloc - sb->len - 1 : 0; } static inline void strbuf_setlen(struct strbuf *sb, size_t len) { assert(len < sb->alloc); sb->len = len; sb->buf[len] = '\0'; } void strbuf_grow(struct strbuf *, size_t); /*----- content related -----*/ void strbuf_rtrim(struct strbuf *); /*----- add data in your buffer -----*/ static inline void strbuf_addch(struct strbuf *sb, int c) { strbuf_grow(sb, 1); sb->buf[sb->len++] = c; sb->buf[sb->len] = '\0'; } /* inserts after pos, or appends if pos >= sb->len */ void strbuf_insert(struct strbuf *, size_t pos, const void *, size_t); void strbuf_remove(struct strbuf *, size_t pos, size_t len); /* splice pos..pos+len with given data */ void strbuf_splice(struct strbuf *, size_t pos, size_t len, const void *, size_t); void strbuf_add(struct strbuf *, const void *, size_t); static inline void strbuf_addstr(struct strbuf *sb, const char *s) { strbuf_add(sb, s, strlen(s)); } static inline void strbuf_addbuf(struct strbuf *sb, struct strbuf *sb2) { strbuf_add(sb, sb2->buf, sb2->len); } void strbuf_addf(struct strbuf *sb, const char *fmt, ...) 
__printf(2, 3); size_t strbuf_fread(struct strbuf *, size_t, FILE *); /* XXX: if read fails, any partial read is undone */ ssize_t strbuf_read(struct strbuf *, int fd, size_t hint); int strbuf_getline(struct strbuf *sb, FILE *fp, int term); int strbuf_copyout(struct strbuf *sb, void *buf, size_t len); int strbuf_stripout(struct strbuf *sb, void *buf, size_t len); #endif sheepdog-0.7.5/include/util.h000066400000000000000000000205141223630776600161100ustar00rootroot00000000000000#ifndef __UTIL_H__ #define __UTIL_H__ #include #include #include #include #include #include #include #include #include #include #include #include #include "logger.h" #include "list.h" #include "compiler.h" #define SECTOR_SIZE (1U << 9) #define BLOCK_SIZE (1U << 12) #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #define round_up(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) #define round_down(x, y) (((x) / (y)) * (y)) #if __BYTE_ORDER == __LITTLE_ENDIAN #define __cpu_to_be16(x) bswap_16(x) #define __cpu_to_be32(x) bswap_32(x) #define __cpu_to_be64(x) bswap_64(x) #define __be16_to_cpu(x) bswap_16(x) #define __be32_to_cpu(x) bswap_32(x) #define __be64_to_cpu(x) bswap_64(x) #define __cpu_to_le32(x) (x) #else #define __cpu_to_be16(x) (x) #define __cpu_to_be32(x) (x) #define __cpu_to_be64(x) (x) #define __be16_to_cpu(x) (x) #define __be32_to_cpu(x) (x) #define __be64_to_cpu(x) (x) #define __cpu_to_le32(x) bswap_32(x) #endif #define uninitialized_var(x) x = x static inline int before(uint32_t seq1, uint32_t seq2) { return (int32_t)(seq1 - seq2) < 0; } static inline int after(uint32_t seq1, uint32_t seq2) { return (int32_t)(seq2 - seq1) < 0; } #define min(x, y) ({ \ typeof(x) _x = (x); \ typeof(y) _y = (y); \ (void) (&_x == &_y); \ _x < _y ? _x : _y; }) #define max(x, y) ({ \ typeof(x) _x = (x); \ typeof(y) _y = (y); \ (void) (&_x == &_y); \ _x > _y ? 
_x : _y; }) static inline void *zalloc(size_t size) { return calloc(1, size); } /* * Compares two integer values * * If the first argument is larger than the second one, intcmp() returns 1. If * two members are equal, returns 0. Otherwise, returns -1. */ #define intcmp(x, y) \ ({ \ typeof(x) _x = (x); \ typeof(y) _y = (y); \ (void) (&_x == &_y); \ _x < _y ? -1 : _x > _y ? 1 : 0; \ }) typedef void (*try_to_free_t)(size_t); try_to_free_t set_try_to_free_routine(try_to_free_t); void *xmalloc(size_t size); void *xzalloc(size_t size); void *xrealloc(void *ptr, size_t size); void *xcalloc(size_t nmemb, size_t size); void *xvalloc(size_t size); ssize_t xread(int fd, void *buf, size_t len); ssize_t xwrite(int fd, const void *buf, size_t len); ssize_t xpread(int fd, void *buf, size_t count, off_t offset); ssize_t xpwrite(int fd, const void *buf, size_t count, off_t offset); int xmkdir(const char *pathname, mode_t mode); int xfallocate(int fd, int mode, off_t offset, off_t len); int xftruncate(int fd, off_t length); int eventfd_xread(int efd); void eventfd_xwrite(int efd, int value); void pstrcpy(char *buf, int buf_size, const char *str); int rmdir_r(char *dir_path); int purge_directory(char *dir_path); bool is_numeric(const char *p); int install_sighandler(int signum, void (*handler)(int), bool once); int install_crash_handler(void (*handler)(int)); void reraise_crash_signal(int signo, int status); pid_t gettid(void); int tkill(int tid, int sig); bool is_xattr_enabled(const char *path); void find_zero_blocks(const void *buf, uint64_t *poffset, uint32_t *plen); void trim_zero_blocks(void *buf, uint64_t *offset, uint32_t *len); void untrim_zero_blocks(void *buf, uint64_t offset, uint32_t len, uint32_t requested_len); int atomic_create_and_write(const char *path, char *buf, size_t len, bool force_create); /* a type safe version of qsort() */ #define xqsort(base, nmemb, compar) \ ({ \ if (nmemb > 1) { \ qsort(base, nmemb, sizeof(*(base)), \ (comparison_fn_t)compar); \ 
assert(compar(base, base + 1) <= 0); \ } \ }) /* a type safe version of bsearch() */ #define xbsearch(key, base, nmemb, compar) \ ({ \ typeof(&(base)[0]) __ret = NULL; \ if (nmemb > 0) { \ assert(compar(key, key) == 0); \ assert(compar(base, base) == 0); \ __ret = bsearch(key, base, nmemb, sizeof(*(base)), \ (comparison_fn_t)compar); \ } \ __ret; \ }) /* a type safe version of lfind() */ #define xlfind(key, base, nmemb, compar) \ ({ \ typeof(&(base)[0]) __ret = NULL; \ if (nmemb > 0) { \ size_t __n = nmemb; \ assert(compar(key, key) == 0); \ assert(compar(base, base) == 0); \ __ret = lfind(key, base, &__n, sizeof(*(base)), \ (comparison_fn_t)compar); \ } \ __ret; \ }) /* * Search 'key' in the array 'base' linearly and remove it if it found. * * If 'key' is found in 'base', this function increments *nmemb and returns * true. */ #define xlremove(key, base, nmemb, compar) \ ({ \ bool __removed = false; \ typeof(&(base)[0]) __e; \ \ __e = xlfind(key, base, *(nmemb), compar); \ if (__e != NULL) { \ (*(nmemb))--; \ memmove(__e, __e + 1, \ sizeof(*(base)) * (*(nmemb) - (__e - (base)))); \ __removed = true; \ } \ __removed; \ }) #ifdef assert #error "Don't include assert.h, use util.h for assert()" #endif #ifndef NDEBUG #define assert(expr) \ ({ \ if (!(expr)) { \ sd_emerg("Asserting `%s' failed.", #expr); \ abort(); \ } \ }) #else #define assert(expr) ((void)0) #endif /* NDEBUG */ /* urcu helpers */ /* Boolean data type which can be accessed by multiple threads */ typedef struct { unsigned long val; } uatomic_bool; static inline bool uatomic_is_true(uatomic_bool *val) { return uatomic_read(&val->val) == 1; } /* success if the old value is false */ static inline bool uatomic_set_true(uatomic_bool *val) { return uatomic_cmpxchg(&val->val, 0, 1) == 0; } static inline void uatomic_set_false(uatomic_bool *val) { uatomic_set(&val->val, 0); } /* * uatomic_xchg_ptr - uatomic_xchg for pointers * * Swaps the old value stored at location p with new value given by * val. 
Returns old value. */ #define uatomic_xchg_ptr(p, val) \ ({ \ uintptr_t ret; \ ret = uatomic_xchg((uintptr_t *)(p), (val)); \ (typeof(*(p)))ret; \ }) /* * refcnt_t: reference counter which can be manipulated by multiple threads * safely */ typedef struct { int val; } refcnt_t; static inline void refcount_set(refcnt_t *rc, int val) { uatomic_set(&rc->val, val); } static inline int refcount_read(refcnt_t *rc) { return uatomic_read(&rc->val); } static inline int refcount_inc(refcnt_t *rc) { return uatomic_add_return(&rc->val, 1); } static inline int refcount_dec(refcnt_t *rc) { assert(1 <= uatomic_read(&rc->val)); return uatomic_sub_return(&rc->val, 1); } /* wrapper for pthread_rwlock */ #define SD_LOCK_INITIALIZER { .rwlock = PTHREAD_RWLOCK_INITIALIZER } struct sd_lock { pthread_rwlock_t rwlock; }; static inline void sd_init_lock(struct sd_lock *lock) { int ret; do { ret = pthread_rwlock_init(&lock->rwlock, NULL); } while (ret == EAGAIN); if (unlikely(ret != 0)) panic("failed to initialize a lock, %s", strerror(ret)); } static inline void sd_destroy_lock(struct sd_lock *lock) { int ret = pthread_rwlock_destroy(&lock->rwlock); if (unlikely(ret != 0)) panic("failed to destroy a lock, %s", strerror(ret)); } static inline void sd_read_lock(struct sd_lock *lock) { int ret; do { ret = pthread_rwlock_rdlock(&lock->rwlock); } while (ret == EAGAIN); if (unlikely(ret != 0)) panic("failed to lock for reading, %s", strerror(ret)); } static inline void sd_write_lock(struct sd_lock *lock) { int ret = pthread_rwlock_wrlock(&lock->rwlock); if (unlikely(ret != 0)) panic("failed to lock for writing, %s", strerror(ret)); } static inline void sd_unlock(struct sd_lock *lock) { int ret = pthread_rwlock_unlock(&lock->rwlock); if (unlikely(ret != 0)) panic("failed to unlock, %s", strerror(ret)); } /* colors */ #define TEXT_NORMAL "\033[0m" #define TEXT_BOLD "\033[1m" #define TEXT_RED "\033[0;31m" #define TEXT_BOLD_RED "\033[1;31m" #define TEXT_GREEN "\033[0;32m" #define TEXT_BOLD_GREEN 
"\033[1;32m" #define TEXT_YELLOW "\033[0;33m" #define TEXT_BOLD_YELLOW "\033[1;33m" #define TEXT_BLUE "\033[0;34m" #define TEXT_BOLD_BLUE "\033[1;34m" #define TEXT_MAGENTA "\033[0;35m" #define TEXT_BOLD_MAGENTA "\033[1;35m" #define TEXT_CYAN "\033[0;36m" #define TEXT_BOLD_CYAN "\033[1;36m" static inline bool is_stdin_console(void) { return isatty(STDIN_FILENO); } static inline bool is_stdout_console(void) { return isatty(STDOUT_FILENO); } extern mode_t sd_def_fmode; extern mode_t sd_def_dmode; #endif sheepdog-0.7.5/include/work.h000066400000000000000000000032011223630776600161070ustar00rootroot00000000000000#ifndef __WORK_H__ #define __WORK_H__ #include #include "list.h" #include "util.h" struct work; typedef void (*work_func_t)(struct work *); struct work { struct list_head w_list; work_func_t fn; work_func_t done; }; struct work_queue { int wq_state; struct list_head pending_list; }; enum wq_thread_control { WQ_ORDERED, /* Only 1 thread created for work queue */ WQ_DYNAMIC, /* # of threads proportional to nr_nodes created */ WQ_UNLIMITED, /* Unlimited # of threads created */ }; static inline bool is_main_thread(void) { return gettid() == getpid(); } static inline bool is_worker_thread(void) { return !is_main_thread(); } /* * Helper macros to guard variables from being accessed out of the * main thread. Note that we can use these only for pointers. */ #define main_thread(type) struct { type __val; } #define main_thread_get(var) \ ({ \ assert(is_main_thread()); \ (var).__val; \ }) #define main_thread_set(var, val) \ ({ \ assert(is_main_thread()); \ (var).__val = (val); \ }) /* * 'get_nr_nodes' is the function to get the current number of nodes and used * for dynamic work queues. 'create_cb' will be called when worker threads are * created and 'destroy_cb' will be called when worker threads are destroyed. 
*/ int init_work_queue(size_t (*get_nr_nodes)(void)); struct work_queue *create_work_queue(const char *name, enum wq_thread_control); struct work_queue *create_ordered_work_queue(const char *name); void suspend_worker_threads(void); void resume_worker_threads(void); void queue_work(struct work_queue *q, struct work *work); bool work_queue_empty(struct work_queue *q); #endif sheepdog-0.7.5/lib/000077500000000000000000000000001223630776600141035ustar00rootroot00000000000000sheepdog-0.7.5/lib/Makefile.am000066400000000000000000000006301223630776600161360ustar00rootroot00000000000000MAINTAINERCLEANFILES = Makefile.in AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include noinst_LIBRARIES = libsheepdog.a libsheepdog_a_SOURCES = event.c logger.c net.c util.c rbtree.c strbuf.c \ sha1.c option.c work.c sockfd_cache.c # support for GNU Flymake check-syntax: $(COMPILE) -fsyntax-only $(CHK_SOURCES) check-style: @$(CHECK_STYLE) $(libsheepdog_a_SOURCES) sheepdog-0.7.5/lib/event.c000066400000000000000000000104271223630776600153740ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include #include #include #include #include #include #include #include "list.h" #include "util.h" #include "event.h" static int efd; static LIST_HEAD(events_list); static void timer_handler(int fd, int events, void *data) { struct timer *t = data; uint64_t val; if (read(fd, &val, sizeof(val)) < 0) return; t->callback(t->data); unregister_event(fd); close(fd); } void add_timer(struct timer *t, unsigned int mseconds) { struct itimerspec it; int tfd; tfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK); if (tfd < 0) { sd_err("timerfd_create: %m"); return; } memset(&it, 0, sizeof(it)); it.it_value.tv_sec = mseconds / 1000; it.it_value.tv_nsec = (mseconds % 1000) * 1000000; if (timerfd_settime(tfd, 0, &it, NULL) < 0) { sd_err("timerfd_settime: %m"); return; } if (register_event(tfd, timer_handler, t) < 0) sd_err("failed to register timer fd"); } struct event_info { event_handler_t handler; int fd; void *data; struct list_head ei_list; int prio; }; static struct epoll_event *events; static int nr_events; int init_event(int nr) { nr_events = nr; events = xcalloc(nr_events, sizeof(struct epoll_event)); efd = epoll_create(nr); if (efd < 0) { sd_err("failed to create epoll fd"); return -1; } return 0; } static struct event_info *lookup_event(int fd) { struct event_info *ei; list_for_each_entry(ei, &events_list, ei_list) { if (ei->fd == fd) return ei; } return NULL; } int register_event_prio(int fd, event_handler_t h, void *data, int prio) { int ret; struct epoll_event ev; struct event_info *ei; ei = xzalloc(sizeof(*ei)); ei->fd = fd; ei->handler = h; ei->data = data; ei->prio = prio; memset(&ev, 0, sizeof(ev)); ev.events = EPOLLIN; ev.data.ptr = ei; ret = epoll_ctl(efd, EPOLL_CTL_ADD, fd, &ev); if (ret) { sd_err("failed to add epoll event: %m"); free(ei); } else list_add(&ei->ei_list, &events_list); return ret; } void unregister_event(int fd) { int ret; struct event_info *ei; ei = lookup_event(fd); if (!ei) return; ret = epoll_ctl(efd, EPOLL_CTL_DEL, fd, NULL); if (ret) 
sd_err("failed to delete epoll event for fd %d: %m", fd); list_del(&ei->ei_list); free(ei); /* * Although ei is no longer valid pointer, ei->handler() might be about * to be called in do_event_loop(). Refreshing the event loop is safe. */ event_force_refresh(); } int modify_event(int fd, unsigned int new_events) { int ret; struct epoll_event ev; struct event_info *ei; ei = lookup_event(fd); if (!ei) { sd_err("event info for fd %d not found", fd); return 1; } memset(&ev, 0, sizeof(ev)); ev.events = new_events; ev.data.ptr = ei; ret = epoll_ctl(efd, EPOLL_CTL_MOD, fd, &ev); if (ret) { sd_err("failed to delete epoll event for fd %d: %m", fd); return 1; } return 0; } static bool event_loop_refresh; void event_force_refresh(void) { event_loop_refresh = true; } static int epoll_event_cmp(const struct epoll_event *_a, struct epoll_event *_b) { struct event_info *a, *b; a = (struct event_info *)_a->data.ptr; b = (struct event_info *)_b->data.ptr; /* we need sort event_info array in reverse order */ return intcmp(b->prio, a->prio); } static void do_event_loop(int timeout, bool sort_with_prio) { int i, nr; refresh: event_loop_refresh = false; nr = epoll_wait(efd, events, nr_events, timeout); if (sort_with_prio) xqsort(events, nr, epoll_event_cmp); if (nr < 0) { if (errno == EINTR) return; sd_err("epoll_wait failed: %m"); exit(1); } else if (nr) { for (i = 0; i < nr; i++) { struct event_info *ei; ei = (struct event_info *)events[i].data.ptr; ei->handler(ei->fd, events[i].events, ei->data); if (event_loop_refresh) goto refresh; } } } void event_loop(int timeout) { do_event_loop(timeout, false); } void event_loop_prio(int timeout) { do_event_loop(timeout, true); } sheepdog-0.7.5/lib/logger.c000066400000000000000000000453171223630776600155400ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . * * This code is based on log.c from Linux target framework (tgt): * Copyright (C) 2002-2003 Ardis Technolgies */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "util.h" static bool colorize; static const char * const log_color[] = { [SDOG_EMERG] = TEXT_BOLD_RED, [SDOG_ALERT] = TEXT_BOLD_RED, [SDOG_CRIT] = TEXT_BOLD_RED, [SDOG_ERR] = TEXT_BOLD_RED, [SDOG_WARNING] = TEXT_BOLD_YELLOW, [SDOG_NOTICE] = TEXT_BOLD_CYAN, [SDOG_INFO] = TEXT_CYAN, [SDOG_DEBUG] = TEXT_GREEN, }; static const char * const log_prio_str[] = { [SDOG_EMERG] = "EMERG", [SDOG_ALERT] = "ALERT", [SDOG_CRIT] = "CRIT", [SDOG_ERR] = "ERROR", [SDOG_WARNING] = "WARN", [SDOG_NOTICE] = "NOTICE", [SDOG_INFO] = "INFO", [SDOG_DEBUG] = "DEBUG", }; static struct logger_user_info *logger_user_info; static void dolog(int prio, const char *func, int line, const char *fmt, va_list ap) __printf(4, 0); union semun { int val; struct semid_ds *buf; unsigned short int *array; struct seminfo *__buf; }; struct logarea { bool active; char *tail; char *start; char *end; int semid; union semun semarg; int fd; }; #define FUNC_NAME_SIZE 32 /* according to C89, including '\0' */ struct logmsg { struct timeval tv; int prio; char func[FUNC_NAME_SIZE]; int line; char worker_name[MAX_THREAD_NAME_LEN]; int worker_idx; size_t str_len; char str[0]; }; struct log_format { const char *name; int (*formatter)(char *, size_t, const struct logmsg *); struct list_head list; }; #define log_format_register(n, formatter_fn) \ static void __attribute__((constructor(101))) \ regist_ ## 
formatter_fn(void) { \ static struct log_format f = \ { .name = n, .formatter = formatter_fn }; \ list_add(&f.list, &log_formats); \ } static LIST_HEAD(log_formats); static struct log_format *format; static int log_fd = -1; static __thread const char *worker_name; static __thread int worker_idx; static struct logarea *la; static const char *log_name; static char *log_nowname; int sd_log_level = SDOG_INFO; static pid_t sheep_pid; static pid_t logger_pid; static key_t semkey; static char *log_buff; static int64_t max_logsize = 500 * 1024 * 1024; /*500MB*/ static pthread_mutex_t logsize_lock = PTHREAD_MUTEX_INITIALIZER; static const char *format_thread_name(char *str, size_t size, const char *name, int idx) { if (name && name[0] && idx) snprintf(str, size, "%s %d", name, idx); else if (name && name[0]) snprintf(str, size, "%s", name); else snprintf(str, size, "main"); return str; } /* * We need to set default log formatter because dog doesn't want to call * select_log_formatter(). */ static void __attribute__((constructor(65535))) init_log_formatter(void) { struct log_format *f; list_for_each_entry(f, &log_formats, list) { if (!strcmp(f->name, "default")) { format = f; return; } } syslog(LOG_ERR, "failed to set default formatter\n"); exit(1); } static int logarea_init(int size) { int shmid; shmid = shmget(IPC_PRIVATE, sizeof(struct logarea), 0644 | IPC_CREAT | IPC_EXCL); if (shmid == -1) { syslog(LOG_ERR, "shmget logarea failed: %m"); return 1; } la = shmat(shmid, NULL, 0); if (!la) { syslog(LOG_ERR, "shmat logarea failed: %m"); return 1; } shmctl(shmid, IPC_RMID, NULL); if (size < MAX_MSG_SIZE) size = LOG_SPACE_SIZE; shmid = shmget(IPC_PRIVATE, size, 0644 | IPC_CREAT | IPC_EXCL); if (shmid == -1) { syslog(LOG_ERR, "shmget msg failed: %m"); shmdt(la); return 1; } la->start = shmat(shmid, NULL, 0); if (!la->start) { syslog(LOG_ERR, "shmat msg failed: %m"); shmdt(la); return 1; } memset(la->start, 0, size); shmctl(shmid, IPC_RMID, NULL); la->end = la->start + size; 
la->tail = la->start; la->semid = semget(semkey, 1, 0666 | IPC_CREAT); if (la->semid < 0) { syslog(LOG_ERR, "semget failed: %m"); shmdt(la->start); shmdt(la); return 1; } la->semarg.val = 1; if (semctl(la->semid, 0, SETVAL, la->semarg) < 0) { syslog(LOG_ERR, "semctl failed: %m"); shmdt(la->start); shmdt(la); return 1; } return 0; } static void free_logarea(void) { if (log_fd >= 0) close(log_fd); semctl(la->semid, 0, IPC_RMID, la->semarg); shmdt(la->start); shmdt(la); } static int server_log_formatter(char *buff, size_t size, const struct logmsg *msg) { char *p = buff; struct tm tm; size_t len; char thread_name[MAX_THREAD_NAME_LEN]; localtime_r(&msg->tv.tv_sec, &tm); len = strftime(p, size, "%b %2d %H:%M:%S ", (const struct tm *)&tm); p += len; size -= len; len = snprintf(p, size, "%s%6s %s[%s] %s(%d) %s%s%s", colorize ? log_color[msg->prio] : "", log_prio_str[msg->prio], colorize ? TEXT_YELLOW : "", format_thread_name(thread_name, sizeof(thread_name), msg->worker_name, msg->worker_idx), msg->func, msg->line, colorize ? log_color[msg->prio] : "", msg->str, colorize ? TEXT_NORMAL : ""); if (len < 0) len = 0; p += min(len, size - 1); return p - buff; } log_format_register("server", server_log_formatter); static int default_log_formatter(char *buff, size_t size, const struct logmsg *msg) { size_t len = min(size, msg->str_len); memcpy(buff, msg->str, len); return len; } log_format_register("default", default_log_formatter); static int json_log_formatter(char *buff, size_t size, const struct logmsg *msg) { char *p = buff; size_t len; assert(logger_user_info); len = snprintf(p, size, "{ \"user_info\": " "{\"program_name\": \"%s\", \"port\": %d}," "\"body\": {" "\"second\": %lu, \"usecond\": %lu, " "\"worker_name\": \"%s\", \"worker_idx\": %d, " "\"func\": \"%s\", \"line\": %d, " "\"msg\": \"", log_name, logger_user_info->port, msg->tv.tv_sec, msg->tv.tv_usec, msg->worker_name[0] ? 
msg->worker_name : "main", msg->worker_idx, msg->func, msg->line); if (len < 0) return 0; len = min(len, size - 1); p += len; size -= len; for (int i = 0; i < msg->str_len; i++) { if (size <= 1) break; if (msg->str[i] == '"') { *p++ = '\\'; size--; } if (size <= 1) break; *p++ = msg->str[i]; size--; } pstrcpy(p, size, "\"} }"); p += strlen(p); return p - buff; } log_format_register("json", json_log_formatter); /* this one can block under memory pressure */ static void log_syslog(const struct logmsg *msg) { char str[MAX_MSG_SIZE]; int len; len = format->formatter(str, sizeof(str) - 1, msg); str[len++] = '\n'; if (log_fd >= 0) xwrite(log_fd, str, len); else syslog(msg->prio, "%s", str); } static void init_logmsg(struct logmsg *msg, struct timeval *tv, int prio, const char *func, int line) { msg->tv = *tv; msg->prio = prio; pstrcpy(msg->func, FUNC_NAME_SIZE, func); msg->line = line; if (worker_name) pstrcpy(msg->worker_name, MAX_THREAD_NAME_LEN, worker_name); else msg->worker_name[0] = '\0'; msg->worker_idx = worker_idx; } static void dolog(int prio, const char *func, int line, const char *fmt, va_list ap) { char buf[sizeof(struct logmsg) + MAX_MSG_SIZE]; char *str = buf + sizeof(struct logmsg); struct logmsg *msg = (struct logmsg *)buf; int len = 0; struct timeval tv; gettimeofday(&tv, NULL); len = vsnprintf(str, MAX_MSG_SIZE, fmt, ap); if (len < 0) { syslog(LOG_ERR, "vsnprintf failed"); return; } msg->str_len = min(len, MAX_MSG_SIZE - 1); if (la) { struct sembuf ops; ops.sem_num = 0; ops.sem_flg = SEM_UNDO; ops.sem_op = -1; if (semop(la->semid, &ops, 1) < 0) { syslog(LOG_ERR, "semop up failed: %m"); return; } /* not enough space: drop msg */ if (len + sizeof(struct logmsg) + 1 > la->end - la->tail) syslog(LOG_ERR, "enqueue: log area overrun, " "dropping message\n"); else { /* ok, we can stage the msg in the area */ msg = (struct logmsg *)la->tail; init_logmsg(msg, &tv, prio, func, line); memcpy(msg->str, str, len + 1); msg->str_len = len; la->tail += sizeof(struct 
logmsg) + len + 1; } ops.sem_op = 1; if (semop(la->semid, &ops, 1) < 0) { syslog(LOG_ERR, "semop down failed: %m"); return; } } else { char str_final[MAX_MSG_SIZE]; init_logmsg(msg, &tv, prio, func, line); len = format->formatter(str_final, sizeof(str_final) - 1, msg); str_final[len++] = '\n'; xwrite(fileno(stderr), str_final, len); fflush(stderr); } } static void rotate_log(void) { int new_fd; if (access(log_nowname, R_OK) == 0) { char old_logfile[256]; time_t t; struct tm tm; time(&t); localtime_r((const time_t *)&t, &tm); snprintf(old_logfile, sizeof(old_logfile), "%s.%04d-%02d-%02d-%02d-%02d", log_nowname, tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, tm.tm_hour, tm.tm_min); rename(log_nowname, old_logfile); } new_fd = open(log_nowname, O_RDWR | O_CREAT | O_APPEND, 0644); if (new_fd < 0) { syslog(LOG_ERR, "failed to create new log file\n"); exit(1); } if (dup2(new_fd, log_fd) < 0) { syslog(LOG_ERR, "failed to dup2 the log fd\n"); exit(1); } close(new_fd); } void log_write(int prio, const char *func, int line, const char *fmt, ...) 
{ va_list ap; if (prio > sd_log_level) return; va_start(ap, fmt); dolog(prio, func, line, fmt, ap); va_end(ap); } static void log_flush(void) { struct sembuf ops; size_t size, done = 0; const struct logmsg *msg; if (la->tail == la->start) return; ops.sem_num = 0; ops.sem_flg = SEM_UNDO; ops.sem_op = -1; if (semop(la->semid, &ops, 1) < 0) { syslog(LOG_ERR, "semop up failed: %m"); exit(1); } size = la->tail - la->start; memcpy(log_buff, la->start, size); memset(la->start, 0, size); la->tail = la->start; ops.sem_op = 1; if (semop(la->semid, &ops, 1) < 0) { syslog(LOG_ERR, "semop down failed: %m"); exit(1); } while (done < size) { msg = (const struct logmsg *)(log_buff + done); log_syslog(msg); done += sizeof(*msg) + msg->str_len + 1; } } static bool is_sheep_dead(int signo) { return signo == SIGHUP; } static void crash_handler(int signo) { if (is_sheep_dead(signo)) sd_err("sheep pid %d exited unexpectedly.", sheep_pid); else { sd_err("logger pid %d exits unexpectedly (%s).", getpid(), strsignal(signo)); sd_backtrace(); } log_flush(); closelog(); free_logarea(); /* If the signal isn't caused by the logger crash, we simply exit. 
*/ if (is_sheep_dead(signo)) exit(1); reraise_crash_signal(signo, 1); } static void logger(char *log_dir, char *outfile) { int fd; log_buff = xzalloc(la->end - la->start); log_fd = open(outfile, O_CREAT | O_RDWR | O_APPEND, 0644); if (log_fd < 0) { syslog(LOG_ERR, "failed to open %s\n", outfile); exit(1); } la->active = true; fd = open("/dev/null", O_RDWR); if (fd < 0) { syslog(LOG_ERR, "failed to open /dev/null: %m\n"); exit(1); } dup2(fd, 0); dup2(fd, 1); dup2(fd, 2); setsid(); if (chdir(log_dir) < 0) { syslog(LOG_ERR, "failed to chdir to %s: %m\n", log_dir); exit(1); } /* flush when either the logger or its parent dies */ install_crash_handler(crash_handler); install_sighandler(SIGHUP, crash_handler, false); prctl(PR_SET_PDEATHSIG, SIGHUP); /* * we need to check the aliveness of the sheep process since * it could die before the logger call prctl. */ if (kill(sheep_pid, 0) < 0) kill(logger_pid, SIGHUP); while (la->active) { log_flush(); if (max_logsize) { off_t offset; pthread_mutex_lock(&logsize_lock); offset = lseek(log_fd, 0, SEEK_END); if (offset < 0) { syslog(LOG_ERR, "sheep log error\n"); } else { size_t log_size = (size_t)offset; if (log_size >= max_logsize) rotate_log(); } pthread_mutex_unlock(&logsize_lock); } sleep(1); } log_flush(); free(log_buff); free_logarea(); exit(0); } void early_log_init(const char *format_name, struct logger_user_info *user_info) { struct log_format *f; logger_user_info = user_info; list_for_each_entry(f, &log_formats, list) { if (!strcmp(f->name, format_name)) { format = f; return; } } sd_err("invalid log format: %s", format_name); sd_err("valid options are:"); list_for_each_entry(f, &log_formats, list) { sd_err("\t%s", f->name); } exit(1); } int log_init(const char *program_name, bool to_stdout, int level, char *outfile) { char log_dir[PATH_MAX], tmp[PATH_MAX]; int size = level == SDOG_DEBUG ? 
LOG_SPACE_DEBUG_SIZE : LOG_SPACE_SIZE; sd_log_level = level; log_name = program_name; log_nowname = outfile; pstrcpy(tmp, sizeof(tmp), outfile); pstrcpy(log_dir, sizeof(log_dir), dirname(tmp)); semkey = random(); if (to_stdout) { if (is_stdout_console()) colorize = true; } else { if (logarea_init(size)) { syslog(LOG_ERR, "failed to initialize the logger\n"); return 1; } /* * Store the pid of the sheep process for use by the death * signal handler. By the time the child is notified of * the parents death the parent has been reparanted to init * and getppid() will always return 1. */ sheep_pid = getpid(); logger_pid = fork(); if (logger_pid < 0) { syslog(LOG_ERR, "failed to fork the logger process: %m\n"); return 1; } if (logger_pid) syslog(LOG_WARNING, "logger pid %d starting\n", logger_pid); else logger(log_dir, outfile); } return 0; } void log_close(void) { if (la) { la->active = false; waitpid(logger_pid, NULL, 0); syslog(LOG_WARNING, "logger pid %d stopped\n", logger_pid); closelog(); free_logarea(); } } void set_thread_name(const char *name, bool show_idx) { worker_name = name; if (show_idx) worker_idx = gettid(); } void get_thread_name(char *name) { format_thread_name(name, MAX_THREAD_NAME_LEN, worker_name, worker_idx); } #define SD_MAX_STACK_DEPTH 1024 static int get_my_path(char *path, size_t size) { /* readlink doesn't append '\0', so initialize here */ memset(path, 0, size); return readlink("/proc/self/exe", path, size); } static bool check_gdb(void) { return system("which gdb > /dev/null") == 0; } /* * __builtin_frame_address() returns address in frame pointer register if any * (e.g, in x86 it returns EBP). If no dedicated register, the frame address is * normally the address of the first word pushed on to the stack by the function * * For a normal subroutine setup, above the value __builtin_frame_address * returns, there are two addresses, which stores old EBP and old EIP, being * pushed on to the stack. 
So we have to plus 2 to get the right value for the * frame address, which is expected by GDB. * * This is tested on X86, other architetures aren't tested. But even if this * formula is wrong, GDB just doesn't procude anything useful after panic. */ #define FRAME_POINTER ((unsigned long *)__builtin_frame_address(0) + 2) __attribute__ ((__noinline__)) int __sd_dump_variable(const char *var) { char cmd[ARG_MAX], path[PATH_MAX], info[256]; FILE *f = NULL; void *base_sp = FRAME_POINTER; if (!check_gdb()) { sd_debug("cannot find gdb"); return -1; } if (get_my_path(path, sizeof(path)) < 0) return -1; snprintf(cmd, sizeof(cmd), "gdb -nw %s %d -batch -ex 'set width 80'" " -ex 'select-frame %p' -ex 'up 1' -ex 'p %s' 2> /dev/null", path, gettid(), base_sp, var); f = popen(cmd, "r"); if (f == NULL) { sd_err("failed to run gdb"); return -1; } /* * The expected outputs of gdb are: * * [some info we don't need] * $1 = { * * } */ sd_emerg("dump %s", var); while (fgets(info, sizeof(info), f) != NULL) { if (info[0] == '$') { sd_emerg("%s", info); break; } } while (fgets(info, sizeof(info), f) != NULL) sd_emerg("%s", info); pclose(f); return 0; } __attribute__ ((__noinline__)) static int dump_stack_frames(void) { char path[PATH_MAX]; int i, stack_no = 0; void *base_sp = FRAME_POINTER; if (!check_gdb()) { sd_debug("cannot find gdb"); return -1; } if (get_my_path(path, sizeof(path)) < 0) return -1; for (i = 1; i < SD_MAX_STACK_DEPTH; i++) { char cmd[ARG_MAX], info[256]; FILE *f = NULL; bool found = false; snprintf(cmd, sizeof(cmd), "gdb -nw %s %d -batch" " -ex 'set width 80' -ex 'select-frame %p'" " -ex 'up %d' -ex 'info locals' 2> /dev/null", path, gettid(), base_sp, i); f = popen(cmd, "r"); if (f == NULL) return -1; /* * The expected outputs of gdb are: * * [some info we don't need] * # in () at : * * */ while (fgets(info, sizeof(info), f) != NULL) { int no; if (sscanf(info, "#%d ", &no) == 1) { if (no <= stack_no) { /* reached to the end of the stacks */ pclose(f); return 0; } 
stack_no = no; found = true; sd_emerg("%s", info); break; } } if (!found) { sd_info("Cannot get info from GDB"); sd_info("Set /proc/sys/kernel/yama/ptrace_scope to" " zero if you are using Ubuntu."); pclose(f); return -1; } while (fgets(info, sizeof(info), f) != NULL) sd_emerg("%s", info); pclose(f); } return 0; } __attribute__ ((__noinline__)) void sd_backtrace(void) { void *addrs[SD_MAX_STACK_DEPTH]; int i, n = backtrace(addrs, ARRAY_SIZE(addrs)); for (i = 1; i < n; i++) { /* addrs[0] is here, so skip it */ void *addr = addrs[i]; char cmd[ARG_MAX], path[PATH_MAX], info[256], **str; FILE *f; /* * The called function is at the previous address * because addr contains a return address */ addr = (void *)((char *)addr - 1); /* try to get a line number with addr2line if possible */ if (get_my_path(path, sizeof(path)) < 0) goto fallback; snprintf(cmd, sizeof(cmd), "addr2line -s -e %s -f -i %p | " "perl -e '@a=<>; chomp @a; print \"$a[1]: $a[0]\"'", path, addr); f = popen(cmd, "r"); if (!f) goto fallback; if (fgets(info, sizeof(info), f) == NULL) goto fallback_close; if (info[0] != '?' && info[0] != '\0') sd_emerg("%s", info); else goto fallback_close; pclose(f); continue; /* * Failed to get a line number, so simply use * backtrace_symbols instead */ fallback_close: pclose(f); fallback: str = backtrace_symbols(&addr, 1); sd_emerg("%s", *str); free(str); } /* dump the stack frames if possible*/ dump_stack_frames(); } sheepdog-0.7.5/lib/net.c000066400000000000000000000321041223630776600150350ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sheepdog_proto.h" #include "sheep.h" #include "util.h" #include "event.h" #include "net.h" int conn_tx_off(struct connection *conn) { conn->events &= ~EPOLLOUT; return modify_event(conn->fd, conn->events); } int conn_tx_on(struct connection *conn) { conn->events |= EPOLLOUT; return modify_event(conn->fd, conn->events); } int conn_rx_off(struct connection *conn) { conn->events &= ~EPOLLIN; return modify_event(conn->fd, conn->events); } int conn_rx_on(struct connection *conn) { conn->events |= EPOLLIN; return modify_event(conn->fd, conn->events); } bool is_conn_dead(const struct connection *conn) { if (conn->c_rx_state == C_IO_CLOSED || conn->c_tx_state == C_IO_CLOSED) return true; else return false; } int rx(struct connection *conn, enum conn_state next_state) { int ret; ret = read(conn->fd, conn->rx_buf, conn->rx_length); if (!ret) { conn->c_rx_state = C_IO_CLOSED; return 0; } if (ret < 0) { if (errno != EAGAIN && errno != EINTR) conn->c_rx_state = C_IO_CLOSED; return 0; } conn->rx_length -= ret; conn->rx_buf = (char *)conn->rx_buf + ret; if (!conn->rx_length) conn->c_rx_state = next_state; return ret; } int tx(struct connection *conn, enum conn_state next_state) { int ret; ret = write(conn->fd, conn->tx_buf, conn->tx_length); if (ret < 0) { if (errno != EAGAIN && errno != EINTR) conn->c_tx_state = C_IO_CLOSED; return 0; } conn->tx_length -= ret; conn->tx_buf = (char *)conn->tx_buf + ret; if (!conn->tx_length) conn->c_tx_state = next_state; return ret; } int create_listen_ports(const char *bindaddr, int port, int (*callback)(int fd, void *), void *data) { char servname[64]; int fd, ret, opt; int success = 0; struct addrinfo hints, *res, *res0; memset(servname, 0, sizeof(servname)); snprintf(servname, sizeof(servname), "%d", port); memset(&hints, 0, sizeof(hints)); hints.ai_socktype = 
SOCK_STREAM; hints.ai_flags = AI_PASSIVE; ret = getaddrinfo(bindaddr, servname, &hints, &res0); if (ret) { sd_err("failed to get address info: %m"); return 1; } for (res = res0; res; res = res->ai_next) { fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol); if (fd < 0) continue; opt = 1; ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); if (ret) sd_err("failed to set SO_REUSEADDR: %m"); opt = 1; if (res->ai_family == AF_INET6) { ret = setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &opt, sizeof(opt)); if (ret) { close(fd); continue; } } ret = bind(fd, res->ai_addr, res->ai_addrlen); if (ret) { sd_err("failed to bind server socket: %m"); close(fd); continue; } ret = listen(fd, SOMAXCONN); if (ret) { sd_err("failed to listen on server socket: %m"); close(fd); continue; } ret = set_nonblocking(fd); if (ret < 0) { close(fd); continue; } ret = callback(fd, data); if (ret) { close(fd); continue; } success++; } freeaddrinfo(res0); if (!success) sd_err("failed to create a listening port"); return !success; } int connect_to(const char *name, int port) { char buf[64]; char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV]; int fd, ret; struct addrinfo hints, *res, *res0; struct linger linger_opt = {1, 0}; memset(&hints, 0, sizeof(hints)); snprintf(buf, sizeof(buf), "%d", port); hints.ai_socktype = SOCK_STREAM; ret = getaddrinfo(name, buf, &hints, &res0); if (ret) { sd_err("failed to get address info: %m"); return -1; } for (res = res0; res; res = res->ai_next) { ret = getnameinfo(res->ai_addr, res->ai_addrlen, hbuf, sizeof(hbuf), sbuf, sizeof(sbuf), NI_NUMERICHOST | NI_NUMERICSERV); if (ret) continue; fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol); if (fd < 0) continue; ret = setsockopt(fd, SOL_SOCKET, SO_LINGER, &linger_opt, sizeof(linger_opt)); if (ret) { sd_err("failed to set SO_LINGER: %m"); close(fd); continue; } ret = set_snd_timeout(fd); if (ret) { sd_err("failed to set send timeout: %m"); close(fd); break; } ret = set_rcv_timeout(fd); if 
(ret) { sd_err("failed to set recv timeout: %m"); close(fd); break; } reconnect: ret = connect(fd, res->ai_addr, res->ai_addrlen); if (ret) { if (errno == EINTR) goto reconnect; sd_err("failed to connect to %s:%d: %m", name, port); close(fd); continue; } ret = set_nodelay(fd); if (ret) { sd_err("%m"); close(fd); break; } else goto success; } fd = -1; success: freeaddrinfo(res0); sd_debug("%d, %s:%d", fd, name, port); return fd; } int do_read(int sockfd, void *buf, int len, bool (*need_retry)(uint32_t epoch), uint32_t epoch, uint32_t max_count) { int ret, repeat = max_count; reread: ret = read(sockfd, buf, len); if (ret == 0) { sd_err("connection is closed (%d bytes left)", len); return 1; } if (ret < 0) { if (errno == EINTR) goto reread; /* * Since we set timeout for read, we'll get EAGAIN even for * blocking sockfd. */ if (errno == EAGAIN && repeat && (need_retry == NULL || need_retry(epoch))) { repeat--; goto reread; } sd_err("failed to read from socket: %d, %m", ret); return 1; } len -= ret; buf = (char *)buf + ret; if (len) goto reread; return 0; } static void forward_iov(struct msghdr *msg, int len) { while (msg->msg_iov->iov_len <= len) { len -= msg->msg_iov->iov_len; msg->msg_iov++; msg->msg_iovlen--; } msg->msg_iov->iov_base = (char *) msg->msg_iov->iov_base + len; msg->msg_iov->iov_len -= len; } static int do_write(int sockfd, struct msghdr *msg, int len, bool (*need_retry)(uint32_t), uint32_t epoch, uint32_t max_count) { int ret, repeat = max_count; rewrite: ret = sendmsg(sockfd, msg, 0); if (ret < 0) { if (errno == EINTR) goto rewrite; /* * Since we set timeout for write, we'll get EAGAIN even for * blocking sockfd. 
*/ if (errno == EAGAIN && repeat && (need_retry == NULL || need_retry(epoch))) { repeat--; goto rewrite; } sd_err("failed to write to socket: %m"); return 1; } len -= ret; if (len) { forward_iov(msg, ret); goto rewrite; } return 0; } int send_req(int sockfd, struct sd_req *hdr, void *data, unsigned int wlen, bool (*need_retry)(uint32_t epoch), uint32_t epoch, uint32_t max_count) { int ret; struct msghdr msg; struct iovec iov[2]; memset(&msg, 0, sizeof(msg)); msg.msg_iov = iov; msg.msg_iovlen = 1; iov[0].iov_base = hdr; iov[0].iov_len = sizeof(*hdr); if (wlen) { msg.msg_iovlen++; iov[1].iov_base = data; iov[1].iov_len = wlen; } ret = do_write(sockfd, &msg, sizeof(*hdr) + wlen, need_retry, epoch, max_count); if (ret) { sd_err("failed to send request %x, %d: %m", hdr->opcode, wlen); ret = -1; } return ret; } int exec_req(int sockfd, struct sd_req *hdr, void *data, bool (*need_retry)(uint32_t epoch), uint32_t epoch, uint32_t max_count) { int ret; struct sd_rsp *rsp = (struct sd_rsp *)hdr; unsigned int wlen, rlen; if (hdr->flags & SD_FLAG_CMD_WRITE) { wlen = hdr->data_length; rlen = 0; } else { wlen = 0; rlen = hdr->data_length; } if (send_req(sockfd, hdr, data, wlen, need_retry, epoch, max_count)) return 1; ret = do_read(sockfd, rsp, sizeof(*rsp), need_retry, epoch, max_count); if (ret) { sd_err("failed to read a response"); return 1; } if (rlen > rsp->data_length) rlen = rsp->data_length; if (rlen) { ret = do_read(sockfd, data, rlen, need_retry, epoch, max_count); if (ret) { sd_err("failed to read the response data"); return 1; } } return 0; } const char *addr_to_str(const uint8_t *addr, uint16_t port) { static __thread char str[HOST_NAME_MAX + 8]; int af = AF_INET6; int addr_start_idx = 0; const char *ret; /* Find address family type */ if (addr[12]) { int oct_no = 0; while (!addr[oct_no] && oct_no++ < 12) ; if (oct_no == 12) { af = AF_INET; addr_start_idx = 12; } } ret = inet_ntop(af, addr + addr_start_idx, str, sizeof(str)); if (unlikely(ret == NULL)) panic("failed 
to convert addr to string, %m"); if (port) { int len = strlen(str); snprintf(str + len, sizeof(str) - len, ":%d", port); } return str; } char *sockaddr_in_to_str(struct sockaddr_in *sockaddr) { int i, si; static char str[32]; uint8_t *addr; si = 0; memset(str, 0, 32); addr = (uint8_t *)&sockaddr->sin_addr.s_addr; for (i = 0; i < 4; i++) { si += snprintf(str + si, 32 - si, i != 3 ? "%d." : "%d", addr[i]); } snprintf(str + si, 32 - si, ":%u", sockaddr->sin_port); return str; } uint8_t *str_to_addr(const char *ipstr, uint8_t *addr) { int addr_start_idx = 0, af = strstr(ipstr, ":") ? AF_INET6 : AF_INET; if (af == AF_INET) { addr_start_idx = 12; memset(addr, 0, addr_start_idx); } if (!inet_pton(af, ipstr, addr + addr_start_idx)) return NULL; return addr; } int set_nonblocking(int fd) { int ret; ret = fcntl(fd, F_GETFL); if (ret < 0) { sd_err("fcntl F_GETFL failed: %m"); close(fd); } else { ret = fcntl(fd, F_SETFL, ret | O_NONBLOCK); if (ret < 0) sd_err("fcntl O_NONBLOCK failed: %m"); } return ret; } int set_snd_timeout(int fd) { struct timeval timeout; timeout.tv_sec = POLL_TIMEOUT; timeout.tv_usec = 0; return setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout)); } int set_rcv_timeout(int fd) { struct timeval timeout; /* * We should wait longer for read than write because the target node might be * busy doing IO */ timeout.tv_sec = MAX_POLLTIME; timeout.tv_usec = 0; return setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout)); } int set_nodelay(int fd) { int ret, opt; opt = 1; ret = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &opt, sizeof(opt)); return ret; } /* * Timeout after request is issued after 5s. * * Heart-beat message will be sent periodically with 1s interval. 
* If the node of the other end of fd fails, we'll detect it in 3s */ int set_keepalive(int fd) { int val = 1; if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &val, sizeof(val)) < 0) { sd_debug("%m"); return -1; } val = 5; if (setsockopt(fd, SOL_TCP, TCP_KEEPIDLE, &val, sizeof(val)) < 0) { sd_debug("%m"); return -1; } val = 1; if (setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, &val, sizeof(val)) < 0) { sd_debug("%m"); return -1; } val = 3; if (setsockopt(fd, SOL_TCP, TCP_KEEPCNT, &val, sizeof(val)) < 0) { sd_debug("%m"); return -1; } return 0; } int get_local_addr(uint8_t *bytes) { struct ifaddrs *ifaddr, *ifa; int ret = 0; if (getifaddrs(&ifaddr) == -1) { sd_err("getifaddrs failed: %m"); return -1; } for (ifa = ifaddr; ifa; ifa = ifa->ifa_next) { struct sockaddr_in *sin; struct sockaddr_in6 *sin6; if (ifa->ifa_flags & IFF_LOOPBACK) continue; if (!ifa->ifa_addr) continue; switch (ifa->ifa_addr->sa_family) { case AF_INET: sin = (struct sockaddr_in *)ifa->ifa_addr; memset(bytes, 0, 12); memcpy(bytes + 12, &sin->sin_addr, 4); memcpy(bytes + 12, &sin->sin_addr, 4); sd_err("found IPv4 address"); goto out; case AF_INET6: sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; memcpy(bytes, &sin6->sin6_addr, 16); sd_err("found IPv6 address"); goto out; } } sd_err("no valid interface found"); ret = -1; out: freeifaddrs(ifaddr); return ret; } int create_unix_domain_socket(const char *unix_path, int (*callback)(int, void *), void *data) { int fd, ret; struct sockaddr_un addr; addr.sun_family = AF_UNIX; pstrcpy(addr.sun_path, sizeof(addr.sun_path), unix_path); fd = socket(addr.sun_family, SOCK_STREAM, 0); if (fd < 0) { sd_err("failed to create socket, %m"); return -1; } ret = bind(fd, &addr, sizeof(addr)); if (ret) { sd_err("failed to bind socket: %m"); goto err; } ret = listen(fd, SOMAXCONN); if (ret) { sd_err("failed to listen on socket: %m"); goto err; } ret = set_nonblocking(fd); if (ret < 0) goto err; ret = callback(fd, data); if (ret) goto err; return 0; err: close(fd); return -1; } bool 
inetaddr_is_valid(char *addr) { unsigned char buf[INET6_ADDRSTRLEN]; int af; af = strstr(addr, ":") ? AF_INET6 : AF_INET; if (!inet_pton(af, addr, buf)) { sd_err("Bad address '%s'", addr); return false; } return true; } int do_writev2(int fd, void *hdr, size_t hdr_len, void *body, size_t body_len) { struct iovec iov[2]; iov[0].iov_base = hdr; iov[0].iov_len = hdr_len; iov[1].iov_base = body; iov[1].iov_len = body_len; return writev(fd, iov, 2); } sheepdog-0.7.5/lib/option.c000066400000000000000000000023401223630776600155560ustar00rootroot00000000000000/* * Copyright (C) 2012 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include "option.h" char *build_short_options(const struct sd_option *sd_opts) { static char sopts[256], *p; const struct sd_option *opt; p = sopts; sd_for_each_option(opt, sd_opts) { *p++ = opt->ch; if (opt->has_arg) *p++ = ':'; } *p = '\0'; return sopts; } struct option *build_long_options(const struct sd_option *sd_opts) { static struct option lopts[256], *p; const struct sd_option *opt; p = lopts; sd_for_each_option(opt, sd_opts) { p->name = opt->name; p->has_arg = opt->has_arg; p->flag = NULL; p->val = opt->ch; p++; } memset(p, 0, sizeof(struct option)); return lopts; } const char *option_get_help(const struct sd_option *sd_opts, int ch) { const struct sd_option *opt; sd_for_each_option(opt, sd_opts) { if (opt->ch == ch) return opt->help; } return NULL; } sheepdog-0.7.5/lib/rbtree.c000066400000000000000000000232701223630776600155360ustar00rootroot00000000000000/* Red Black Trees (C) 1999 Andrea Arcangeli (C) 2002 David Woodhouse This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public 
/*
 * Restore the red-black invariants after 'node' has been linked into the
 * tree (via rb_link_node) as a red leaf.
 *
 * Standard bottom-up insertion fixup: while the parent is red, either
 * recolour (red uncle) and continue from the grandparent, or rotate
 * (black uncle) and terminate.  The root is forced black at the end.
 */
void rb_insert_color(struct rb_node *node, struct rb_root *root)
{
	struct rb_node *parent, *gparent;

	while ((parent = rb_parent(node)) && rb_is_red(parent)) {
		gparent = rb_parent(parent);

		if (parent == gparent->rb_left) {
			register struct rb_node *uncle = gparent->rb_right;
			if (uncle && rb_is_red(uncle)) {
				/* case 1: red uncle -> recolour, move up */
				rb_set_black(uncle);
				rb_set_black(parent);
				rb_set_red(gparent);
				node = gparent;
				continue;
			}

			if (parent->rb_right == node) {
				/* case 2: inner child -> rotate to case 3 */
				register struct rb_node *tmp;
				__rb_rotate_left(parent, root);
				tmp = parent;
				parent = node;
				node = tmp;
			}

			/* case 3: outer child -> recolour and rotate */
			rb_set_black(parent);
			rb_set_red(gparent);
			__rb_rotate_right(gparent, root);
		} else {
			/* mirror image of the above */
			register struct rb_node *uncle = gparent->rb_left;
			if (uncle && rb_is_red(uncle)) {
				rb_set_black(uncle);
				rb_set_black(parent);
				rb_set_red(gparent);
				node = gparent;
				continue;
			}

			if (parent->rb_left == node) {
				register struct rb_node *tmp;
				__rb_rotate_right(parent, root);
				tmp = parent;
				parent = node;
				node = tmp;
			}

			rb_set_black(parent);
			rb_set_red(gparent);
			__rb_rotate_left(gparent, root);
		}
	}

	/* the root is always black */
	rb_set_black(root->rb_node);
}
rb_set_black(other->rb_left); __rb_rotate_right(parent, root); node = root->rb_node; break; } } } if (node) rb_set_black(node); } void rb_erase(struct rb_node *node, struct rb_root *root) { struct rb_node *child, *parent; int color; if (!node->rb_left) child = node->rb_right; else if (!node->rb_right) child = node->rb_left; else { struct rb_node *old = node, *left; node = node->rb_right; while ((left = node->rb_left) != NULL) node = left; if (rb_parent(old)) { if (rb_parent(old)->rb_left == old) rb_parent(old)->rb_left = node; else rb_parent(old)->rb_right = node; } else root->rb_node = node; child = node->rb_right; parent = rb_parent(node); color = rb_color(node); if (parent == old) parent = node; else { if (child) rb_set_parent(child, parent); parent->rb_left = child; node->rb_right = old->rb_right; rb_set_parent(old->rb_right, node); } node->rb_parent_color = old->rb_parent_color; node->rb_left = old->rb_left; rb_set_parent(old->rb_left, node); goto color; } parent = rb_parent(node); color = rb_color(node); if (child) rb_set_parent(child, parent); if (parent) { if (parent->rb_left == node) parent->rb_left = child; else parent->rb_right = child; } else root->rb_node = child; color: if (color == RB_BLACK) __rb_erase_color(child, parent, root); } static void rb_augment_path(struct rb_node *node, rb_augment_f func, void *data) { struct rb_node *parent; up: func(node, data); parent = rb_parent(node); if (!parent) return; if (node == parent->rb_left && parent->rb_right) func(parent->rb_right, data); else if (parent->rb_left) func(parent->rb_left, data); node = parent; goto up; } /* * after inserting @node into the tree, update the tree to account for * both the new entry and any damage done by rebalance */ void rb_augment_insert(struct rb_node *node, rb_augment_f func, void *data) { if (node->rb_left) node = node->rb_left; else if (node->rb_right) node = node->rb_right; rb_augment_path(node, func, data); } /* * before removing the node, find the deepest node on the 
rebalance path * that will still be there after @node gets removed */ struct rb_node *rb_augment_erase_begin(struct rb_node *node) { struct rb_node *deepest; if (!node->rb_right && !node->rb_left) deepest = rb_parent(node); else if (!node->rb_right) deepest = node->rb_left; else if (!node->rb_left) deepest = node->rb_right; else { deepest = rb_next(node); if (deepest->rb_right) deepest = deepest->rb_right; else if (rb_parent(deepest) != node) deepest = rb_parent(deepest); } return deepest; } /* * after removal, update the tree to account for the removed entry * and any rebalance damage. */ void rb_augment_erase_end(struct rb_node *node, rb_augment_f func, void *data) { if (node) rb_augment_path(node, func, data); } /* This function returns the first node (in sort order) of the tree. */ struct rb_node *rb_first(const struct rb_root *root) { struct rb_node *n; n = root->rb_node; if (!n) return NULL; while (n->rb_left) n = n->rb_left; return n; } struct rb_node *rb_last(const struct rb_root *root) { struct rb_node *n; n = root->rb_node; if (!n) return NULL; while (n->rb_right) n = n->rb_right; return n; } struct rb_node *rb_next(const struct rb_node *node) { struct rb_node *parent; if (rb_parent(node) == node) return NULL; /* * If we have a right-hand child, go down and then left as far * as we can. */ if (node->rb_right) { node = node->rb_right; while (node->rb_left) node = node->rb_left; return (struct rb_node *)node; } /* * No right-hand children. Everything down and left is * smaller than us, so any 'next' node must be in the general * direction of our parent. Go up the tree; any time the * ancestor is a right-hand child of its parent, keep going * up. First time it's a left-hand child of its parent, said * parent is our 'next' node. 
*/ while ((parent = rb_parent(node)) && node == parent->rb_right) node = parent; return parent; } struct rb_node *rb_prev(const struct rb_node *node) { struct rb_node *parent; if (rb_parent(node) == node) return NULL; /* * If we have a left-hand child, go down and then right as far * as we can. */ if (node->rb_left) { node = node->rb_left; while (node->rb_right) node = node->rb_right; return (struct rb_node *)node; } /* * No left-hand children. Go up till we find an ancestor which * is a right-hand child of its parent */ while ((parent = rb_parent(node)) && node == parent->rb_left) node = parent; return parent; } void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root) { struct rb_node *parent = rb_parent(victim); /* Set the surrounding nodes to point to the replacement */ if (parent) { if (victim == parent->rb_left) parent->rb_left = new; else parent->rb_right = new; } else { root->rb_node = new; } if (victim->rb_left) rb_set_parent(victim->rb_left, new); if (victim->rb_right) rb_set_parent(victim->rb_right, new); /* Copy the pointers/colour from the victim to the replacement */ *new = *victim; } sheepdog-0.7.5/lib/sha1.c000066400000000000000000000157061223630776600151140ustar00rootroot00000000000000/* * Cryptographic API. * * SHA1 Secure Hash Algorithm. * * Derived from cryptoapi implementation, adapted for in-place * scatterlist interface. Originally based on the public domain * implementation written by Steve Reid. * * Copyright (c) Alan Smithee. * Copyright (c) Andrew McDonald * Copyright (c) Jean-Francois Dive * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. 
* */ #include #include "sha1.h" #include "util.h" static __always_inline uint32_t rol(uint32_t value, uint32_t bits) { return (value << bits) | (value >> (32 - bits)); } /* blk0() and blk() perform the initial expand. */ /* I got the idea of expanding during the round function from SSLeay */ # define blk0(i) block32[i] #define blk(i) \ (block32[i & 15] = rol(block32[(i + 13) & 15] ^ block32[(i + 8) & 15] \ ^ block32[(i + 2) & 15] ^ block32[i & 15], 1)) /* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */ #define R0(v, w, x, y, z, i) \ z += ((w & (x ^ y)) ^ y) + blk0(i) + 0x5A827999 + rol(v, 5); \ w = rol(w, 30); #define R1(v, w, x, y, z, i) \ z += ((w & (x ^ y)) ^ y) + blk(i) + 0x5A827999 + rol(v, 5); \ w = rol(w, 30); #define R2(v, w, x, y, z, i) \ z += (w ^ x ^ y) + blk(i) + 0x6ED9EBA1 + rol(v, 5); \ w = rol(w, 30); #define R3(v, w, x, y, z, i) \ z += (((w | x) & y) | (w & x)) + blk(i) + 0x8F1BBCDC + rol(v, 5); \ w = rol(w, 30); #define R4(v, w, x, y, z, i) \ z += (w ^ x ^ y) + blk(i) + 0xCA62C1D6 + rol(v, 5); \ w = rol(w, 30); /* Hash a single 512-bit block. This is the core of the algorithm. */ static void sha1_transform(uint32_t *state, const uint8_t *in) { uint32_t a, b, c, d, e; uint32_t block32[16]; /* convert/copy data to workspace */ for (a = 0; a < sizeof(block32)/sizeof(uint32_t); a++) block32[a] = ntohl(((const uint32_t *)in)[a]); /* Copy context->state[] to working vars */ a = state[0]; b = state[1]; c = state[2]; d = state[3]; e = state[4]; /* 4 rounds of 20 operations each. Loop unrolled. 
*/ R0(a, b, c, d, e, 0); R0(e, a, b, c, d, 1); R0(d, e, a, b, c, 2); R0(c, d, e, a, b, 3); R0(b, c, d, e, a, 4); R0(a, b, c, d, e, 5); R0(e, a, b, c, d, 6); R0(d, e, a, b, c, 7); R0(c, d, e, a, b, 8); R0(b, c, d, e, a, 9); R0(a, b, c, d, e, 10); R0(e, a, b, c, d, 11); R0(d, e, a, b, c, 12); R0(c, d, e, a, b, 13); R0(b, c, d, e, a, 14); R0(a, b, c, d, e, 15); R1(e, a, b, c, d, 16); R1(d, e, a, b, c, 17); R1(c, d, e, a, b, 18); R1(b, c, d, e, a, 19); R2(a, b, c, d, e, 20); R2(e, a, b, c, d, 21); R2(d, e, a, b, c, 22); R2(c, d, e, a, b, 23); R2(b, c, d, e, a, 24); R2(a, b, c, d, e, 25); R2(e, a, b, c, d, 26); R2(d, e, a, b, c, 27); R2(c, d, e, a, b, 28); R2(b, c, d, e, a, 29); R2(a, b, c, d, e, 30); R2(e, a, b, c, d, 31); R2(d, e, a, b, c, 32); R2(c, d, e, a, b, 33); R2(b, c, d, e, a, 34); R2(a, b, c, d, e, 35); R2(e, a, b, c, d, 36); R2(d, e, a, b, c, 37); R2(c, d, e, a, b, 38); R2(b, c, d, e, a, 39); R3(a, b, c, d, e, 40); R3(e, a, b, c, d, 41); R3(d, e, a, b, c, 42); R3(c, d, e, a, b, 43); R3(b, c, d, e, a, 44); R3(a, b, c, d, e, 45); R3(e, a, b, c, d, 46); R3(d, e, a, b, c, 47); R3(c, d, e, a, b, 48); R3(b, c, d, e, a, 49); R3(a, b, c, d, e, 50); R3(e, a, b, c, d, 51); R3(d, e, a, b, c, 52); R3(c, d, e, a, b, 53); R3(b, c, d, e, a, 54); R3(a, b, c, d, e, 55); R3(e, a, b, c, d, 56); R3(d, e, a, b, c, 57); R3(c, d, e, a, b, 58); R3(b, c, d, e, a, 59); R4(a, b, c, d, e, 60); R4(e, a, b, c, d, 61); R4(d, e, a, b, c, 62); R4(c, d, e, a, b, 63); R4(b, c, d, e, a, 64); R4(a, b, c, d, e, 65); R4(e, a, b, c, d, 66); R4(d, e, a, b, c, 67); R4(c, d, e, a, b, 68); R4(b, c, d, e, a, 69); R4(a, b, c, d, e, 70); R4(e, a, b, c, d, 71); R4(d, e, a, b, c, 72); R4(c, d, e, a, b, 73); R4(b, c, d, e, a, 74); R4(a, b, c, d, e, 75); R4(e, a, b, c, d, 76); R4(d, e, a, b, c, 77); R4(c, d, e, a, b, 78); R4(b, c, d, e, a, 79); /* Add the working vars back into context.state[] */ state[0] += a; state[1] += b; state[2] += c; state[3] += d; state[4] += e; /* Wipe variables */ a = b = c = d = e 
/*
 * Feed 'len' bytes of 'data' into the SHA1 context.
 *
 * sctx->count holds the total message length in BITS; (count >> 3) & 0x3f
 * is therefore the number of bytes currently buffered in the 64-byte
 * block buffer.  Full 64-byte blocks are hashed immediately, any
 * remainder is stashed in sctx->buffer for the next call or sha1_final().
 *
 * NOTE(review): 'len << 3' is computed in unsigned int, so a single
 * update larger than 512 MiB would wrap before being added to count --
 * confirm callers never pass such lengths.
 */
void sha1_update(void *ctx, const uint8_t *data, unsigned int len)
{
	struct sha1_ctx *sctx = ctx;
	unsigned int i, j;

	/* current fill level of the partial block buffer */
	j = (sctx->count >> 3) & 0x3f;
	sctx->count += len << 3;

	if ((j + len) > 63) {
		/* complete the buffered block, then hash full blocks inline */
		memcpy(&sctx->buffer[j], data, (i = 64-j));
		sha1_transform(sctx->state, sctx->buffer);
		for ( ; i + 63 < len; i += 64)
			sha1_transform(sctx->state, &data[i]);
		j = 0;
	} else
		i = 0;
	/* keep the tail for later */
	memcpy(&sctx->buffer[j], &data[i], len - i);
}
(56 - idx) : ((64+56) - idx); sha1_update(sctx, padding, padlen); /* Append length */ sha1_update(sctx, bits, sizeof bits); /* Store state in digest */ for (i = j = 0; i < 5; i++, j += 4) { uint32_t t2 = sctx->state[i]; out[j+3] = t2 & 0xff; t2 >>= 8; out[j+2] = t2 & 0xff; t2 >>= 8; out[j+1] = t2 & 0xff; t2 >>= 8; out[j] = t2 & 0xff; } /* Wipe context */ memset(sctx, 0, sizeof *sctx); } const char *sha1_to_hex(const unsigned char *sha1) { static __thread char buffer[50]; static const char hex[] = "0123456789abcdef"; char *buf = buffer; int i; for (i = 0; i < SHA1_DIGEST_SIZE; i++) { unsigned int val = *sha1++; *buf++ = hex[val >> 4]; *buf++ = hex[val & 0xf]; } return buffer; } /* * Calculate a sha1 message digest based on the content of 'buf' * * We can uniquely generate the original buffer from * - the trimmed buffer * - the orignal buffer length * - the trimmed buffer length * - the trimmed buffer offset * * This calculates a unique sha1 digest faster than the naive calculation when * the content of 'buf' is sparse. The result will be set in 'sha1'. */ void sha1_from_buffer(const void *buf, size_t size, unsigned char *sha1) { struct sha1_ctx c; uint64_t offset = 0; uint32_t length = size; sha1_init(&c); sha1_update(&c, (uint8_t *)&length, sizeof(length)); find_zero_blocks(buf, &offset, &length); sha1_update(&c, (uint8_t *)&length, sizeof(length)); sha1_update(&c, (uint8_t *)&offset, sizeof(offset)); sha1_update(&c, buf, length); sha1_final(&c, sha1); } sheepdog-0.7.5/lib/sockfd_cache.c000066400000000000000000000310671223630776600166520ustar00rootroot00000000000000/* * Copyright (C) 2012-2013 Taobao Inc. * * Liu Yuan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ /* * The sockfd cache provides us long TCP connections connected to the nodes * in the cluster to accerlater the data transfer, which has the following * characteristics: * 0 dynamically allocated/deallocated at node granularity. * 1 cached fds are multiplexed by all threads. * 2 each session (for e.g, forward_write_obj_req) can grab one fd at a time. * 3 if there isn't any FD available from cache, use normal connect_to() and * close() internally. * 4 FD are named by IP:PORT uniquely, hence no need of resetting at * membership change. * 5 the total number of FDs is scalable to massive nodes. * 6 total 3 APIs: sheep_{get,put,del}_sockfd(). * 7 support dual connections to a single node. */ #include #include "sockfd_cache.h" #include "work.h" #include "rbtree.h" #include "util.h" #include "sheep.h" struct sockfd_cache { struct rb_root root; struct sd_lock lock; int count; }; static struct sockfd_cache sockfd_cache = { .root = RB_ROOT, .lock = SD_LOCK_INITIALIZER, }; /* * Suppose request size from Guest is 512k, then 4M / 512k = 8, so at * most 8 requests can be issued to the same sheep object. Based on this * assumption, '8' would be effecient for servers that only host 2~4 * Guests. 
* * This fd count will be dynamically grown when the idx reaches watermark which * is calculated by FDS_WATERMARK */ #define FDS_WATERMARK(x) ((x) * 3 / 4) #define DEFAULT_FDS_COUNT 8 /* How many FDs we cache for one node */ static int fds_count = DEFAULT_FDS_COUNT; struct sockfd_cache_fd { int fd; uatomic_bool in_use; }; struct sockfd_cache_entry { struct rb_node rb; struct node_id nid; struct sockfd_cache_fd *fds; }; static struct sockfd_cache_entry * sockfd_cache_insert(struct sockfd_cache_entry *new) { struct rb_node **p = &sockfd_cache.root.rb_node; struct rb_node *parent = NULL; struct sockfd_cache_entry *entry; while (*p) { int cmp; parent = *p; entry = rb_entry(parent, struct sockfd_cache_entry, rb); cmp = node_id_cmp(&new->nid, &entry->nid); if (cmp < 0) p = &(*p)->rb_left; else if (cmp > 0) p = &(*p)->rb_right; else return entry; } rb_link_node(&new->rb, parent, p); rb_insert_color(&new->rb, &sockfd_cache.root); return NULL; /* insert successfully */ } static struct sockfd_cache_entry *sockfd_cache_search(const struct node_id *nid) { struct rb_node *n = sockfd_cache.root.rb_node; struct sockfd_cache_entry *t; while (n) { int cmp; t = rb_entry(n, struct sockfd_cache_entry, rb); cmp = node_id_cmp(nid, &t->nid); if (cmp < 0) n = n->rb_left; else if (cmp > 0) n = n->rb_right; else return t; /* found it */ } return NULL; } static inline int get_free_slot(struct sockfd_cache_entry *entry) { int idx = -1, i; for (i = 0; i < fds_count; i++) { if (!uatomic_set_true(&entry->fds[i].in_use)) continue; idx = i; break; } return idx; } /* * Grab a free slot of the node and inc the refcount of the slot * * If no free slot available, this typically means we should use short FD. 
*/ static struct sockfd_cache_entry *sockfd_cache_grab(const struct node_id *nid, int *ret_idx) { struct sockfd_cache_entry *entry; sd_read_lock(&sockfd_cache.lock); entry = sockfd_cache_search(nid); if (!entry) { sd_debug("failed node %s", addr_to_str(nid->addr, nid->port)); goto out; } *ret_idx = get_free_slot(entry); if (*ret_idx == -1) entry = NULL; out: sd_unlock(&sockfd_cache.lock); return entry; } static inline bool slots_all_free(struct sockfd_cache_entry *entry) { int i; for (i = 0; i < fds_count; i++) if (uatomic_is_true(&entry->fds[i].in_use)) return false; return true; } static inline void destroy_all_slots(struct sockfd_cache_entry *entry) { int i; for (i = 0; i < fds_count; i++) if (entry->fds[i].fd != -1) close(entry->fds[i].fd); } static void free_cache_entry(struct sockfd_cache_entry *entry) { free(entry->fds); free(entry); } /* * Destroy all the Cached FDs of the node * * We don't proceed if some other node grab one FD of the node. In this case, * the victim node will finally find itself talking to a dead node and call * sockfd_cache_del() to delete this node from the cache. 
*/ static bool sockfd_cache_destroy(const struct node_id *nid) { struct sockfd_cache_entry *entry; sd_write_lock(&sockfd_cache.lock); entry = sockfd_cache_search(nid); if (!entry) { sd_debug("It is already destroyed"); goto false_out; } if (!slots_all_free(entry)) { sd_debug("Some victim still holds it"); goto false_out; } rb_erase(&entry->rb, &sockfd_cache.root); sd_unlock(&sockfd_cache.lock); destroy_all_slots(entry); free_cache_entry(entry); return true; false_out: sd_unlock(&sockfd_cache.lock); return false; } static void sockfd_cache_add_nolock(const struct node_id *nid) { struct sockfd_cache_entry *new = xmalloc(sizeof(*new)); int i; new->fds = xzalloc(sizeof(struct sockfd_cache_fd) * fds_count); for (i = 0; i < fds_count; i++) new->fds[i].fd = -1; memcpy(&new->nid, nid, sizeof(struct node_id)); if (sockfd_cache_insert(new)) { free_cache_entry(new); return; } sockfd_cache.count++; } /* Add group of nodes to the cache */ void sockfd_cache_add_group(const struct sd_node *nodes, int nr) { const struct sd_node *p; sd_debug("%d", nr); sd_write_lock(&sockfd_cache.lock); while (nr--) { p = nodes + nr; sockfd_cache_add_nolock(&p->nid); } sd_unlock(&sockfd_cache.lock); } /* Add one node to the cache means we can do caching tricks on this node */ void sockfd_cache_add(const struct node_id *nid) { struct sockfd_cache_entry *new; int n, i; sd_write_lock(&sockfd_cache.lock); new = xmalloc(sizeof(*new)); new->fds = xzalloc(sizeof(struct sockfd_cache_fd) * fds_count); for (i = 0; i < fds_count; i++) new->fds[i].fd = -1; memcpy(&new->nid, nid, sizeof(struct node_id)); if (sockfd_cache_insert(new)) { free_cache_entry(new); sd_unlock(&sockfd_cache.lock); return; } sd_unlock(&sockfd_cache.lock); n = uatomic_add_return(&sockfd_cache.count, 1); sd_debug("%s, count %d", addr_to_str(nid->addr, nid->port), n); } static uatomic_bool fds_in_grow; static int fds_high_watermark = FDS_WATERMARK(DEFAULT_FDS_COUNT); static struct work_queue *grow_wq; static void do_grow_fds(struct work 
*work) { struct sockfd_cache_entry *entry; struct rb_node *p; int old_fds_count, new_fds_count, new_size, i; sd_debug("%d", fds_count); sd_write_lock(&sockfd_cache.lock); old_fds_count = fds_count; new_fds_count = fds_count * 2; new_size = sizeof(struct sockfd_cache_fd) * fds_count * 2; for (p = rb_first(&sockfd_cache.root); p; p = rb_next(p)) { entry = rb_entry(p, struct sockfd_cache_entry, rb); entry->fds = xrealloc(entry->fds, new_size); for (i = old_fds_count; i < new_fds_count; i++) { entry->fds[i].fd = -1; uatomic_set_false(&entry->fds[i].in_use); } } fds_count *= 2; fds_high_watermark = FDS_WATERMARK(fds_count); sd_unlock(&sockfd_cache.lock); } static void grow_fds_done(struct work *work) { sd_debug("fd count has been grown into %d", fds_count); uatomic_set_false(&fds_in_grow); free(work); } static inline void check_idx(int idx) { struct work *w; if (idx <= fds_high_watermark) return; if (!uatomic_set_true(&fds_in_grow)) return; w = xmalloc(sizeof(*w)); w->fn = do_grow_fds; w->done = grow_fds_done; queue_work(grow_wq, w); } /* Add the node back if it is still alive */ static inline int revalidate_node(const struct node_id *nid) { bool use_io = nid->io_port ? true : false; int fd; if (use_io) { fd = connect_to_addr(nid->io_addr, nid->io_port); if (fd >= 0) goto alive; } fd = connect_to_addr(nid->addr, nid->port); if (fd < 0) return false; alive: close(fd); sockfd_cache_add(nid); return true; } /* Try to create/get cached IO connection. If failed, fallback to non-IO one */ static struct sockfd *sockfd_cache_get_long(const struct node_id *nid) { struct sockfd_cache_entry *entry; struct sockfd *sfd; bool use_io = nid->io_port ? true : false; const uint8_t *addr = use_io ? nid->io_addr : nid->addr; int fd, idx = -1, port = use_io ? nid->io_port : nid->port; grab: entry = sockfd_cache_grab(nid, &idx); if (!entry) { /* * The node is deleted, but someone askes us to grab it. 
* The nid is not in the sockfd cache but probably it might be * still alive due to broken network connection or was just too * busy to serve any request that makes other nodes deleted it * from the sockfd cache. In such cases, we need to add it back. */ if (!revalidate_node(nid)) return NULL; goto grab; } check_idx(idx); if (entry->fds[idx].fd != -1) { sd_debug("%s, idx %d", addr_to_str(addr, port), idx); goto out; } /* Create a new cached connection for this node */ sd_debug("create cache connection %s idx %d", addr_to_str(addr, port), idx); fd = connect_to_addr(addr, port); if (fd < 0) { if (use_io) { sd_err("fallback to non-io connection"); fd = connect_to_addr(nid->addr, nid->port); if (fd >= 0) goto new; } uatomic_set_false(&entry->fds[idx].in_use); return NULL; } new: entry->fds[idx].fd = fd; out: sfd = xmalloc(sizeof(*sfd)); sfd->fd = entry->fds[idx].fd; sfd->idx = idx; return sfd; } static void sockfd_cache_put_long(const struct node_id *nid, int idx) { bool use_io = nid->io_port ? true : false; const uint8_t *addr = use_io ? nid->io_addr : nid->addr; int port = use_io ? nid->io_port : nid->port; struct sockfd_cache_entry *entry; sd_debug("%s idx %d", addr_to_str(addr, port), idx); sd_read_lock(&sockfd_cache.lock); entry = sockfd_cache_search(nid); if (entry) uatomic_set_false(&entry->fds[idx].in_use); sd_unlock(&sockfd_cache.lock); } static void sockfd_cache_close(const struct node_id *nid, int idx) { bool use_io = nid->io_port ? true : false; const uint8_t *addr = use_io ? nid->io_addr : nid->addr; int port = use_io ? nid->io_port : nid->port; struct sockfd_cache_entry *entry; sd_debug("%s idx %d", addr_to_str(addr, port), idx); sd_write_lock(&sockfd_cache.lock); entry = sockfd_cache_search(nid); if (entry) { close(entry->fds[idx].fd); entry->fds[idx].fd = -1; uatomic_set_false(&entry->fds[idx].in_use); } sd_unlock(&sockfd_cache.lock); } /* * Create work queue for growing fds. * Before this function called, growing cannot be done. 
*/ int sockfd_init(void) { grow_wq = create_ordered_work_queue("sockfd_grow"); if (!grow_wq) { sd_err("error at creating workqueue for sockfd growth"); return -1; } return 0; } /* * Return a sockfd connected to the node to the caller * * Try to get a 'long' FD as best, which is cached and never closed. If no FD * available, we return a 'short' FD which is supposed to be closed by * sockfd_cache_put(). * * ret_idx is opaque to the caller, -1 indicates it is a short FD. */ struct sockfd *sockfd_cache_get(const struct node_id *nid) { struct sockfd *sfd; int fd; sfd = sockfd_cache_get_long(nid); if (sfd) return sfd; /* Fallback on a non-io connection that is to be closed shortly */ fd = connect_to_addr(nid->addr, nid->port); if (fd < 0) return NULL; sfd = xmalloc(sizeof(*sfd)); sfd->idx = -1; sfd->fd = fd; sd_debug("%d", fd); return sfd; } /* * Release a sockfd connected to the node, which is acquired from * sockfd_cache_get() * * If it is a long FD, just decrease the refcount to make it available again. * If it is a short FD, close it. */ void sockfd_cache_put(const struct node_id *nid, struct sockfd *sfd) { if (sfd->idx == -1) { sd_debug("%d", sfd->fd); close(sfd->fd); free(sfd); return; } sockfd_cache_put_long(nid, sfd->idx); free(sfd); } /* Delete all sockfd connected to the node, when node is crashed. */ void sockfd_cache_del_node(const struct node_id *nid) { int n; if (!sockfd_cache_destroy(nid)) return; n = uatomic_sub_return(&sockfd_cache.count, 1); sd_debug("%s, count %d", addr_to_str(nid->addr, nid->port), n); } /* * Delete a sockfd connected to the node. * * If it is a long FD, de-refcount it and tres to destroy all the cached FDs of * this node in the cache. * If it is a short FD, just close it. 
*/ void sockfd_cache_del(const struct node_id *nid, struct sockfd *sfd) { if (sfd->idx == -1) { sd_debug("%d", sfd->fd); close(sfd->fd); free(sfd); return; } sockfd_cache_close(nid, sfd->idx); sockfd_cache_del_node(nid); free(sfd); } sheepdog-0.7.5/lib/strbuf.c000066400000000000000000000107351223630776600155620ustar00rootroot00000000000000/* * Taken from git by Liu Yuan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "strbuf.h" #include "logger.h" #include "util.h" void strbuf_init(struct strbuf *sb, size_t hint) { memset(sb, 0, sizeof(*sb)); if (hint) strbuf_grow(sb, hint); } void strbuf_release(struct strbuf *sb) { free(sb->buf); memset(sb, 0, sizeof(*sb)); } void strbuf_reset(struct strbuf *sb) { if (sb->len) strbuf_setlen(sb, 0); sb->eof = 0; } char *strbuf_detach(struct strbuf *sb) { char *res = sb->buf; strbuf_init(sb, 0); return res; } void strbuf_attach(struct strbuf *sb, void *buf, size_t len, size_t alloc) { strbuf_release(sb); sb->buf = buf; sb->len = len; sb->alloc = alloc; strbuf_grow(sb, 0); sb->buf[sb->len] = '\0'; } void strbuf_grow(struct strbuf *sb, size_t extra) { if (unlikely(sb->len + extra + 1 <= sb->len)) panic("you want to use way too much memory"); ALLOC_GROW(sb->buf, sb->len + extra + 1, sb->alloc); } void strbuf_rtrim(struct strbuf *sb) { while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1])) sb->len--; sb->buf[sb->len] = '\0'; } void strbuf_insert(struct strbuf *sb, size_t pos, const void *data, size_t len) { strbuf_grow(sb, len); if (unlikely(pos > sb->len)) panic("`pos' is too far after the end of the buffer"); memmove(sb->buf + pos + len, sb->buf + pos, sb->len - pos); memcpy(sb->buf + pos, data, len); strbuf_setlen(sb, sb->len + len); } void strbuf_splice(struct 
strbuf *sb, size_t pos, size_t len, const void *data, size_t dlen) { if (unlikely(pos + len < pos)) panic("you want to use way too much memory"); if (unlikely(pos > sb->len)) panic("`pos' is too far after the end of the buffer"); if (unlikely(pos + len > sb->len)) panic("`pos + len' is too far after the end of the buffer"); if (dlen >= len) strbuf_grow(sb, dlen - len); memmove(sb->buf + pos + dlen, sb->buf + pos + len, sb->len - pos - len); memcpy(sb->buf + pos, data, dlen); strbuf_setlen(sb, sb->len + dlen - len); } void strbuf_remove(struct strbuf *sb, size_t pos, size_t len) { strbuf_splice(sb, pos, len, NULL, 0); } void strbuf_add(struct strbuf *sb, const void *data, size_t len) { strbuf_grow(sb, len); memcpy(sb->buf + sb->len, data, len); strbuf_setlen(sb, sb->len + len); } void strbuf_addf(struct strbuf *sb, const char *fmt, ...) { int len; va_list ap; va_start(ap, fmt); len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap); va_end(ap); if (len < 0) len = 0; if (len > strbuf_avail(sb)) { strbuf_grow(sb, len); va_start(ap, fmt); len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap); va_end(ap); if (unlikely(len > strbuf_avail(sb))) panic("this should not happen, your snprintf is broken"); } strbuf_setlen(sb, sb->len + len); } size_t strbuf_fread(struct strbuf *sb, size_t size, FILE *f) { size_t res; strbuf_grow(sb, size); res = fread(sb->buf + sb->len, 1, size, f); if (res > 0) strbuf_setlen(sb, sb->len + res); return res; } ssize_t strbuf_read(struct strbuf *sb, int fd, size_t hint) { size_t oldlen = sb->len; strbuf_grow(sb, hint ? 
/*
 * Read characters from 'fp' into 'sb' up to and including the terminator
 * 'term' (or end of file).  The buffer is reset first, so it holds
 * exactly one record afterwards.
 *
 * Returns 0 on success, EOF when the stream is exhausted and nothing
 * was read.
 */
static int strbuf_getwholeline(struct strbuf *sb, FILE *fp, int term)
{
	int ch;

	if (feof(fp))
		return EOF;
	strbuf_reset(sb);
	while ((ch = fgetc(fp)) != EOF) {
		strbuf_grow(sb, 1);
		sb->buf[sb->len++] = ch;
		if (ch == term)
			break;
	}
	/* hit EOF before reading anything: report it */
	if (ch == EOF && sb->len == 0)
		return EOF;

	sb->buf[sb->len] = '\0';
	return 0;
}
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "util.h" mode_t sd_def_dmode = S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP | S_IWGRP | S_IXGRP; mode_t sd_def_fmode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; static void do_nothing(size_t size) { } static void (*try_to_free_routine)(size_t size) = do_nothing; try_to_free_t set_try_to_free_routine(try_to_free_t routine) { try_to_free_t old = try_to_free_routine; if (!routine) routine = do_nothing; try_to_free_routine = routine; return old; } void *xmalloc(size_t size) { void *ret = malloc(size); if (unlikely(!ret) && unlikely(!size)) ret = malloc(1); if (unlikely(!ret)) { try_to_free_routine(size); ret = malloc(size); if (!ret && !size) ret = malloc(1); if (!ret) panic("Out of memory"); } return ret; } void *xzalloc(size_t size) { return xcalloc(1, size); } void *xrealloc(void *ptr, size_t size) { void *ret = realloc(ptr, size); if (unlikely(!ret) && unlikely(!size)) ret = realloc(ptr, 1); if (unlikely(!ret)) { try_to_free_routine(size); ret = realloc(ptr, size); if (!ret && !size) ret = realloc(ptr, 1); if (!ret) panic("Out of memory"); } return ret; } void *xcalloc(size_t nmemb, size_t size) { void *ret = calloc(nmemb, size); if (unlikely(!ret) && unlikely(!nmemb || !size)) ret = calloc(1, 1); if (unlikely(!ret)) { try_to_free_routine(nmemb * size); ret = calloc(nmemb, size); if (!ret && (!nmemb || !size)) ret = calloc(1, 1); if (!ret) panic("Out of memory"); } return ret; } void *xvalloc(size_t size) { void *ret = valloc(size); if (unlikely(!ret)) panic("Out of memory"); return ret; } static ssize_t _read(int fd, void *buf, size_t len) { ssize_t nr; while (true) { nr = read(fd, buf, len); if (unlikely(nr < 0) && (errno == EAGAIN || errno == EINTR)) continue; return nr; } } static ssize_t _write(int fd, const void *buf, size_t len) { ssize_t nr; while (true) { nr = write(fd, buf, len); if (unlikely(nr < 0) && (errno == EAGAIN || errno == 
EINTR)) continue; return nr; } } ssize_t xread(int fd, void *buf, size_t count) { char *p = buf; ssize_t total = 0; while (count > 0) { ssize_t loaded = _read(fd, p, count); if (unlikely(loaded < 0)) return -1; if (unlikely(loaded == 0)) return total; count -= loaded; p += loaded; total += loaded; } return total; } ssize_t xwrite(int fd, const void *buf, size_t count) { const char *p = buf; ssize_t total = 0; while (count > 0) { ssize_t written = _write(fd, p, count); if (unlikely(written < 0)) return -1; if (unlikely(!written)) { errno = ENOSPC; return -1; } count -= written; p += written; total += written; } return total; } static ssize_t _pread(int fd, void *buf, size_t len, off_t offset) { ssize_t nr; while (true) { nr = pread(fd, buf, len, offset); if (unlikely(nr < 0) && (errno == EAGAIN || errno == EINTR)) continue; return nr; } } static ssize_t _pwrite(int fd, const void *buf, size_t len, off_t offset) { ssize_t nr; while (true) { nr = pwrite(fd, buf, len, offset); if (unlikely(nr < 0) && (errno == EAGAIN || errno == EINTR)) continue; return nr; } } ssize_t xpread(int fd, void *buf, size_t count, off_t offset) { char *p = buf; ssize_t total = 0; while (count > 0) { ssize_t loaded = _pread(fd, p, count, offset); if (unlikely(loaded < 0)) return -1; if (unlikely(loaded == 0)) return total; count -= loaded; p += loaded; total += loaded; offset += loaded; } return total; } ssize_t xpwrite(int fd, const void *buf, size_t count, off_t offset) { const char *p = buf; ssize_t total = 0; while (count > 0) { ssize_t written = _pwrite(fd, p, count, offset); if (unlikely(written < 0)) return -1; if (unlikely(!written)) { errno = ENOSPC; return -1; } count -= written; p += written; total += written; offset += written; } return total; } /* Return EEXIST when path exists but not a directory */ int xmkdir(const char *pathname, mode_t mode) { if (mkdir(pathname, mode) < 0) { struct stat st; if (errno != EEXIST) return -1; if (stat(pathname, &st) < 0) return -1; if 
(!S_ISDIR(st.st_mode)) { errno = EEXIST; return -1; } } return 0; } int xfallocate(int fd, int mode, off_t offset, off_t len) { int ret; do { ret = fallocate(fd, mode, offset, len); } while (unlikely(ret < 0) && (errno == EAGAIN || errno == EINTR)); return ret; } int xftruncate(int fd, off_t length) { int ret; do { ret = ftruncate(fd, length); } while (unlikely(ret < 0) && (errno == EAGAIN || errno == EINTR)); return ret; } /* * Return the read value on success, or -1 if efd has been made nonblocking and * errno is EAGAIN. If efd has been marked blocking or the eventfd counter is * not zero, this function doesn't return error. */ int eventfd_xread(int efd) { int ret; eventfd_t value = 0; do { ret = eventfd_read(efd, &value); } while (unlikely(ret < 0) && errno == EINTR); if (ret == 0) ret = value; else if (unlikely(errno != EAGAIN)) panic("eventfd_read() failed, %m"); return ret; } void eventfd_xwrite(int efd, int value) { int ret; do { ret = eventfd_write(efd, (eventfd_t)value); } while (unlikely(ret < 0) && (errno == EINTR || errno == EAGAIN)); if (unlikely(ret < 0)) panic("eventfd_write() failed, %m"); } /* * Copy the string str to buf. If str length is bigger than buf_size - * 1 then it is clamped to buf_size - 1. * NOTE: this function does what strncpy should have done to be * useful. NEVER use strncpy. 
* * @param buf destination buffer * @param buf_size size of destination buffer * @param str source string */ void pstrcpy(char *buf, int buf_size, const char *str) { int c; char *q = buf; if (buf_size <= 0) return; while (true) { c = *str++; if (c == 0 || q >= buf + buf_size - 1) break; *q++ = c; } *q = '\0'; } /* Purge directory recursively */ int purge_directory(char *dir_path) { int ret = 0; struct stat s; DIR *dir; struct dirent *d; char path[PATH_MAX]; dir = opendir(dir_path); if (!dir) { if (errno != ENOENT) sd_err("failed to open %s: %m", dir_path); return -errno; } while ((d = readdir(dir))) { if (!strcmp(d->d_name, ".") || !strcmp(d->d_name, "..")) continue; snprintf(path, sizeof(path), "%s/%s", dir_path, d->d_name); ret = stat(path, &s); if (ret) { sd_err("failed to stat %s: %m", path); goto out; } if (S_ISDIR(s.st_mode)) ret = rmdir_r(path); else ret = unlink(path); if (ret != 0) { sd_err("failed to remove %s %s: %m", S_ISDIR(s.st_mode) ? "directory" : "file", path); goto out; } } out: closedir(dir); return ret; } /* remove directory recursively */ int rmdir_r(char *dir_path) { int ret; ret = purge_directory(dir_path); if (ret == 0) ret = rmdir(dir_path); return ret; } /* * Find zero blocks from the beginning and end of buffer * * The caller passes the offset of 'buf' with 'poffset' so that this funciton * can align the return values to BLOCK_SIZE. 'plen' points the length of the * buffer. If there are zero blocks at the beginning of the buffer, this * function increases the offset and decreases the length on condition that * '*poffset' is block-aligned. If there are zero blocks at the end of the * buffer, this function also decreases the length on condition that '*plen' is * block-aligned. 
*/ void find_zero_blocks(const void *buf, uint64_t *poffset, uint32_t *plen) { const uint8_t zero[BLOCK_SIZE] = {0}; const uint8_t *p = buf; uint64_t start = *poffset; uint64_t offset = 0; uint32_t len = *plen; /* trim zero blocks from the beginning of buffer */ while (len >= BLOCK_SIZE) { size_t size = BLOCK_SIZE - (start + offset) % BLOCK_SIZE; if (memcmp(p + offset, zero, size) != 0) break; offset += size; len -= size; } /* trim zero sectors from the end of buffer */ while (len >= BLOCK_SIZE) { size_t size = (start + offset + len) % BLOCK_SIZE; if (size == 0) size = BLOCK_SIZE; if (memcmp(p + offset + len - size, zero, size) != 0) break; len -= size; } *plen = len; *poffset = start + offset; } /* * Trim zero blocks from the beginning and end of buffer * * This function is similar to find_zero_blocks(), but this updates 'buf' so * that the zero block are removed from the beginning of buffer. */ void trim_zero_blocks(void *buf, uint64_t *poffset, uint32_t *plen) { uint8_t *p = buf; uint64_t orig_offset = *poffset; find_zero_blocks(buf, poffset, plen); if (orig_offset < *poffset) memmove(p, p + *poffset - orig_offset, *plen); } /* * Untrim zero blocks to the beginning and end of buffer * * 'offset' is the offset of 'buf' in the original buffer, 'len' is the length * of 'buf', and 'requested_len' is the length of the original buffer. 'buf' * must have enough spaces to contain 'requested_len' bytes. */ void untrim_zero_blocks(void *buf, uint64_t offset, uint32_t len, uint32_t requested_len) { uint8_t *p = buf; if (offset > 0) { memmove(p + offset, buf, len); memset(p, 0, offset); } if (offset + len < requested_len) memset(p + offset + len, 0, requested_len - offset - len); } bool is_numeric(const char *s) { const char *p = s; if (*p) { char c; while ((c = *p++)) if (!isdigit(c)) return false; return true; } return false; } /* * If 'once' is true, the signal will be restored to the default state * after 'handler' is called. 
*/ int install_sighandler(int signum, void (*handler)(int), bool once) { struct sigaction sa = {}; sa.sa_handler = handler; if (once) sa.sa_flags = SA_RESETHAND | SA_NODEFER; sigemptyset(&sa.sa_mask); return sigaction(signum, &sa, NULL); } int install_crash_handler(void (*handler)(int)) { return install_sighandler(SIGSEGV, handler, true) || install_sighandler(SIGABRT, handler, true) || install_sighandler(SIGBUS, handler, true) || install_sighandler(SIGILL, handler, true) || install_sighandler(SIGFPE, handler, true); } /* * Re-raise the signal 'signo' for the default signal handler to dump * a core file, and exit with 'status' if the default handler cannot * terminate the process. This function is expected to be called in * the installed signal handlers with install_crash_handler(). */ void reraise_crash_signal(int signo, int status) { int ret = raise(signo); /* We won't get here normally. */ if (ret != 0) sd_emerg("failed to re-raise signal %d (%s).", signo, strsignal(signo)); else sd_emerg("default handler for the re-raised " "signal %d (%s) didn't work expectedly", signo, strsignal(signo)); exit(status); } pid_t gettid(void) { return syscall(SYS_gettid); } int tkill(int tid, int sig) { return syscall(SYS_tgkill, getpid(), tid, sig); } bool is_xattr_enabled(const char *path) { int ret, dummy; ret = getxattr(path, "user.dummy", &dummy, sizeof(dummy)); return !(ret == -1 && errno == ENOTSUP); } /* * If force_create is true, this function create the file even when the * temporary file exists. 
*/ int atomic_create_and_write(const char *path, char *buf, size_t len, bool force_create) { int fd, ret; char tmp_path[PATH_MAX]; snprintf(tmp_path, PATH_MAX, "%s.tmp", path); again: fd = open(tmp_path, O_WRONLY | O_CREAT | O_SYNC | O_EXCL, sd_def_fmode); if (fd < 0) { if (errno == EEXIST) { if (force_create) { sd_debug("clean up a temporary file %s", tmp_path); unlink(tmp_path); goto again; } else sd_debug("someone else is dealing with %s", tmp_path); } else sd_err("failed to open temporal file %s, %m", tmp_path); ret = -1; goto end; } ret = xwrite(fd, buf, len); if (unlikely(ret != len)) { sd_err("failed to write %s, %m", path); ret = -1; goto close_fd; } ret = rename(tmp_path, path); if (unlikely(ret < 0)) { sd_err("failed to rename %s, %m", path); ret = -1; } close_fd: close(fd); end: return ret; } /* * Returns a list organized in an intermediate format suited * to chaining of merge() calls: null-terminated, no reserved or * sentinel head node, "prev" links not maintained. */ static struct list_head *merge(void *priv, int (*cmp)(void *priv, struct list_head *a, struct list_head *b), struct list_head *a, struct list_head *b) { struct list_head head, *tail = &head; while (a && b) { /* if equal, take 'a' -- important for sort stability */ if ((*cmp)(priv, a, b) <= 0) { tail->next = a; a = a->next; } else { tail->next = b; b = b->next; } tail = tail->next; } tail->next = a?:b; return head.next; } /* * Combine final list merge with restoration of standard doubly-linked * list structure. This approach duplicates code from merge(), but * runs faster than the tidier alternatives of either a separate final * prev-link restoration pass, or maintaining the prev links * throughout. 
*/ static void merge_and_restore_back_links(void *priv, int (*cmp)(void *priv, struct list_head *a, struct list_head *b), struct list_head *head, struct list_head *a, struct list_head *b) { struct list_head *tail = head; while (a && b) { /* if equal, take 'a' -- important for sort stability */ if ((*cmp)(priv, a, b) <= 0) { tail->next = a; a->prev = tail; a = a->next; } else { tail->next = b; b->prev = tail; b = b->next; } tail = tail->next; } tail->next = a ? : b; do { /* * In worst cases this loop may run many iterations. * Continue callbacks to the client even though no * element comparison is needed, so the client's cmp() * routine can invoke cond_resched() periodically. */ (*cmp)(priv, tail->next, tail->next); tail->next->prev = tail; tail = tail->next; } while (tail->next); tail->next = head; head->prev = tail; } /* * list_sort - sort a list * @priv: private data, opaque to list_sort(), passed to @cmp * @head: the list to sort * @cmp: the elements comparison function * * This function implements "merge sort", which has O(nlog(n)) * complexity. * * The comparison function @cmp must return a negative value if @a * should sort before @b, and a positive value if @a should sort after * @b. If @a and @b are equivalent, and their original relative * ordering is to be preserved, @cmp must return 0. 
*/ void list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv, struct list_head *a, struct list_head *b)) { /* sorted partial lists -- last slot is a sentinel */ #define MAX_LIST_LENGTH_BITS 20 struct list_head *part[MAX_LIST_LENGTH_BITS+1]; int lev; /* index into part[] */ int max_lev = 0; struct list_head *list; if (list_empty(head)) return; memset(part, 0, sizeof(part)); head->prev->next = NULL; list = head->next; while (list) { struct list_head *cur = list; list = list->next; cur->next = NULL; for (lev = 0; part[lev]; lev++) { cur = merge(priv, cmp, part[lev], cur); part[lev] = NULL; } if (lev > max_lev) { if (unlikely(lev >= ARRAY_SIZE(part)-1)) { /* * list passed to list_sort() too long for * efficiency */ lev--; } max_lev = lev; } part[lev] = cur; } for (lev = 0; lev < max_lev; lev++) if (part[lev]) list = merge(priv, cmp, part[lev], list); merge_and_restore_back_links(priv, cmp, head, part[max_lev], list); } sheepdog-0.7.5/lib/work.c000066400000000000000000000231551223630776600152370ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
* * This code is based on bs.c from Linux target framework (tgt): * Copyright (C) 2007 FUJITA Tomonori * Copyright (C) 2007 Mike Christie */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "list.h" #include "util.h" #include "bitops.h" #include "work.h" #include "event.h" #define TID_MAX_DEFAULT 0x8000 /* default maximum tid for most systems */ static size_t tid_max; static unsigned long *tid_map; static int resume_efd; static int ack_efd; /* * The protection period from shrinking work queue. This is necessary * to avoid many calls of pthread_create. Without it, threads are * frequently created and deleted and it leads poor performance. */ #define WQ_PROTECTION_PERIOD 1000 /* ms */ struct worker_info { const char *name; struct list_head finished_list; struct list_head worker_info_siblings; pthread_mutex_t finished_lock; pthread_mutex_t startup_lock; /* wokers sleep on this and signaled by tgtd */ pthread_cond_t pending_cond; /* locked by tgtd and workers */ pthread_mutex_t pending_lock; /* protected by pending_lock */ struct work_queue q; size_t nr_threads; /* protected by uatomic primitives */ size_t nr_workers; /* we cannot shrink work queue till this time */ uint64_t tm_end_of_protection; enum wq_thread_control tc; }; static int efd; static LIST_HEAD(worker_info_list); static size_t nr_nodes = 1; static size_t (*wq_get_nr_nodes)(void); static void *worker_routine(void *arg); static uint64_t get_msec_time(void) { struct timeval tv; gettimeofday(&tv, NULL); return tv.tv_sec * 1000 + tv.tv_usec / 1000; } static inline uint64_t wq_get_roof(struct worker_info *wi) { uint64_t nr = 1; switch (wi->tc) { case WQ_ORDERED: break; case WQ_DYNAMIC: /* FIXME: 2 * nr_nodes threads. No rationale yet. 
*/ nr = nr_nodes * 2; break; case WQ_UNLIMITED: nr = SIZE_MAX; break; default: panic("Invalid threads control %d", wi->tc); } return nr; } static bool wq_need_grow(struct worker_info *wi) { if (wi->nr_threads < uatomic_read(&wi->nr_workers) && wi->nr_threads * 2 <= wq_get_roof(wi)) { wi->tm_end_of_protection = get_msec_time() + WQ_PROTECTION_PERIOD; return true; } return false; } /* * Return true if more than half of threads are not used more than * WQ_PROTECTION_PERIOD seconds */ static bool wq_need_shrink(struct worker_info *wi) { if (uatomic_read(&wi->nr_workers) < wi->nr_threads / 2) /* we cannot shrink work queue during protection period. */ return wi->tm_end_of_protection <= get_msec_time(); /* update the end of protection time */ wi->tm_end_of_protection = get_msec_time() + WQ_PROTECTION_PERIOD; return false; } static int create_worker_threads(struct worker_info *wi, size_t nr_threads) { pthread_t thread; int ret; pthread_mutex_lock(&wi->startup_lock); while (wi->nr_threads < nr_threads) { ret = pthread_create(&thread, NULL, worker_routine, wi); if (ret != 0) { sd_err("failed to create worker thread: %m"); pthread_mutex_unlock(&wi->startup_lock); return -1; } wi->nr_threads++; sd_debug("create thread %s %zu", wi->name, wi->nr_threads); } pthread_mutex_unlock(&wi->startup_lock); return 0; } void suspend_worker_threads(void) { struct worker_info *wi; int tid; list_for_each_entry(wi, &worker_info_list, worker_info_siblings) { pthread_mutex_lock(&wi->pending_lock); } FOR_EACH_BIT(tid, tid_map, tid_max) { if (unlikely(tkill(tid, SIGUSR2) < 0)) panic("%m"); } /* * Wait for all the worker thread to suspend. We cannot use * wi->nr_threads here because some thread may have not called set_bit() * yet (then, the thread doesn't recieve SIGUSR2). 
*/ FOR_EACH_BIT(tid, tid_map, tid_max) { eventfd_xread(ack_efd); } } void resume_worker_threads(void) { struct worker_info *wi; int nr_threads = 0, tid; FOR_EACH_BIT(tid, tid_map, tid_max) { nr_threads++; } eventfd_xwrite(resume_efd, nr_threads); for (int i = 0; i < nr_threads; i++) eventfd_xread(ack_efd); list_for_each_entry(wi, &worker_info_list, worker_info_siblings) { pthread_mutex_unlock(&wi->pending_lock); } } void queue_work(struct work_queue *q, struct work *work) { struct worker_info *wi = container_of(q, struct worker_info, q); uatomic_inc(&wi->nr_workers); pthread_mutex_lock(&wi->pending_lock); if (wq_need_grow(wi)) /* double the thread pool size */ create_worker_threads(wi, wi->nr_threads * 2); list_add_tail(&work->w_list, &wi->q.pending_list); pthread_mutex_unlock(&wi->pending_lock); pthread_cond_signal(&wi->pending_cond); } static void worker_thread_request_done(int fd, int events, void *data) { struct worker_info *wi; struct work *work; LIST_HEAD(list); if (wq_get_nr_nodes) nr_nodes = wq_get_nr_nodes(); eventfd_xread(fd); list_for_each_entry(wi, &worker_info_list, worker_info_siblings) { pthread_mutex_lock(&wi->finished_lock); list_splice_init(&wi->finished_list, &list); pthread_mutex_unlock(&wi->finished_lock); while (!list_empty(&list)) { work = list_first_entry(&list, struct work, w_list); list_del(&work->w_list); work->done(work); uatomic_dec(&wi->nr_workers); } } } static void *worker_routine(void *arg) { struct worker_info *wi = arg; struct work *work; int tid = gettid(); set_thread_name(wi->name, (wi->tc != WQ_ORDERED)); pthread_mutex_lock(&wi->startup_lock); /* started this thread */ pthread_mutex_unlock(&wi->startup_lock); pthread_mutex_lock(&wi->pending_lock); if (tid > tid_max) { size_t old_tid_max = tid_max; /* enlarge bitmap size */ while (tid > tid_max) tid_max *= 2; tid_map = alloc_bitmap(tid_map, old_tid_max, tid_max); } set_bit(tid, tid_map); pthread_mutex_unlock(&wi->pending_lock); while (true) { 
pthread_mutex_lock(&wi->pending_lock); if (wq_need_shrink(wi)) { wi->nr_threads--; clear_bit(tid, tid_map); pthread_mutex_unlock(&wi->pending_lock); pthread_detach(pthread_self()); sd_debug("destroy thread %s %d, %zu", wi->name, tid, wi->nr_threads); break; } retest: if (list_empty(&wi->q.pending_list)) { pthread_cond_wait(&wi->pending_cond, &wi->pending_lock); goto retest; } work = list_first_entry(&wi->q.pending_list, struct work, w_list); list_del(&work->w_list); pthread_mutex_unlock(&wi->pending_lock); if (work->fn) work->fn(work); pthread_mutex_lock(&wi->finished_lock); list_add_tail(&work->w_list, &wi->finished_list); pthread_mutex_unlock(&wi->finished_lock); eventfd_xwrite(efd, 1); } pthread_exit(NULL); } static void suspend(int num) { int uninitialized_var(value); eventfd_xwrite(ack_efd, 1); /* ack of suspend */ value = eventfd_xread(resume_efd); assert(value == 1); eventfd_xwrite(ack_efd, 1); /* ack of resume */ } int init_work_queue(size_t (*get_nr_nodes)(void)) { int ret; wq_get_nr_nodes = get_nr_nodes; if (wq_get_nr_nodes) nr_nodes = wq_get_nr_nodes(); tid_max = TID_MAX_DEFAULT; tid_map = alloc_bitmap(NULL, 0, tid_max); resume_efd = eventfd(0, EFD_SEMAPHORE); ack_efd = eventfd(0, EFD_SEMAPHORE); efd = eventfd(0, EFD_NONBLOCK); if (resume_efd < 0 || ack_efd < 0 || efd < 0) { sd_err("failed to create event fds: %m"); return 1; } /* trace uses this signal to suspend the worker threads */ if (install_sighandler(SIGUSR2, suspend, false) < 0) { sd_debug("%m"); return -1; } ret = register_event(efd, worker_thread_request_done, NULL); if (ret) { sd_err("failed to register event fd %m"); close(efd); return 1; } return 0; } /* * Allowing unlimited threads to be created is necessary to solve the following * problems: * * 1. timeout of IO requests from guests. With on-demand short threads, we * guarantee that there is always one thread available to execute the * request as soon as possible. * 2. 
sheep halt for corner case that all gateway and io threads are executing * local requests that ask for creation of another thread to execute the * requests and sleep-wait for responses. */ struct work_queue *create_work_queue(const char *name, enum wq_thread_control tc) { int ret; struct worker_info *wi; wi = xzalloc(sizeof(*wi)); wi->name = name; wi->tc = tc; INIT_LIST_HEAD(&wi->q.pending_list); INIT_LIST_HEAD(&wi->finished_list); pthread_cond_init(&wi->pending_cond, NULL); pthread_mutex_init(&wi->finished_lock, NULL); pthread_mutex_init(&wi->pending_lock, NULL); pthread_mutex_init(&wi->startup_lock, NULL); ret = create_worker_threads(wi, 1); if (ret < 0) goto destroy_threads; list_add(&wi->worker_info_siblings, &worker_info_list); return &wi->q; destroy_threads: pthread_mutex_unlock(&wi->startup_lock); pthread_cond_destroy(&wi->pending_cond); pthread_mutex_destroy(&wi->pending_lock); pthread_mutex_destroy(&wi->startup_lock); pthread_mutex_destroy(&wi->finished_lock); return NULL; } struct work_queue *create_ordered_work_queue(const char *name) { return create_work_queue(name, WQ_ORDERED); } bool work_queue_empty(struct work_queue *q) { struct worker_info *wi = container_of(q, struct worker_info, q); return uatomic_read(&wi->nr_workers) == 0; } sheepdog-0.7.5/man/000077500000000000000000000000001223630776600141105ustar00rootroot00000000000000sheepdog-0.7.5/man/Makefile.am000066400000000000000000000007461223630776600161530ustar00rootroot00000000000000MAINTAINERCLEANFILES = Makefile.in dist_man_MANS = sheep.8 dog.8 if BUILD_SHEEPFS dist_man_MANS += sheepfs.8 endif EXTRA_DIST = sheep.8.in dog.8.in sheepfs.8.in %.8: %.8.in Makefile $(top_srcdir)/script/gen_man.pl $(top_builddir)/%/$* rm -f $@-t $@ @sed \ -e "s#@DATE@#`date '+%Y-%m-%d'`#g" \ -e "s#@OPTIONS@#$(shell $(top_srcdir)/script/gen_man.pl $(top_builddir)/$*/$*)#g" \ $< > $@-t mv $@-t $@ all-local: $(dist_man_MANS) clean-local: rm -rf $(dist_man_MANS) 
sheepdog-0.7.5/man/dog.8.in000066400000000000000000000021061223630776600153560ustar00rootroot00000000000000.TH SHEEPDOG 8 @DATE@ .SH NAME dog \- Command line utility for the sheep daemon .SH SYNOPSIS .B "dog [options]" .SH DESCRIPTION .B dog - Sheepdog is a distributed storage system for QEMU. It provides highly available block level storage volumes to virtual machines. Sheepdog supports advanced volume management features such as snapshot, cloning, and thin provisioning. The architecture of Sheepdog is fully symmetric; there is no central node such as a meta-data server. The server daemon is called sheep(8). A command line utility is available via dog(8). QEMU virtual machines use the sheep daemon via a block driver available in qemu(1). For more information, run 'dog --help'. .SH COMMAND & SUBCOMMAND @OPTIONS@ .SH DEPENDENCIES \fBSheepdog\fP requires QEMU 0.13.z or later and Corosync 1.y.z or 2.y.z. .SH FILES none .SH SEE ALSO .BR sheep(8), .BR qemu(1), .BR sheepfs(8), .BR corosync_overview(8) .SH AUTHORS This software is developed by the Sheepdog community which may be reached via mailing list at . .PP sheepdog-0.7.5/man/sheep.8.in000066400000000000000000000025251223630776600157160ustar00rootroot00000000000000.TH SHEEPDOG 8 @DATE@ .SH NAME sheep \- Distributed Block Storage System for QEMU .SH SYNOPSIS .B "sheep [options] [PATH]" .SH DESCRIPTION .B sheep - Sheepdog is a distributed storage system for QEMU. It provides highly available block level storage volumes to virtual machines. Sheepdog supports advanced volume management features such as snapshot, cloning, and thin provisioning. The architecture of Sheepdog is fully symmetric; there is no central node such as a meta-data server. The server daemon is called sheep(8). A command line utility is available via dog(8). QEMU virtual machines use the sheep daemon via a block driver available in qemu(1). .SH OPTIONS @OPTIONS@ .SH PATH Proper LSB systems will store sheepdog files in /var/lib/sheepdog. 
The init script uses this directory by default. The directory must be on a filesystem with xattr support. In the case of ext3, user_xattr should be added to the mount options. mount \-o remount,user_xattr /var/lib/sheepdog .SH DEPENDENCIES \fBsheepdog\fP requires QEMU 0.13.z or later and Corosync 1.y.z. .SH FILES .B /var/lib/sheepdog - Directory containing block storage information .SH SEE ALSO .BR dog(8), .BR qemu(1), .BR sheepfs(8), .BR corosync_overview(8) .SH AUTHORS This software is developed by the sheepdog community which may be reached via mailing list at . .PP sheepdog-0.7.5/man/sheepfs.8.in000066400000000000000000000037751223630776600162570ustar00rootroot00000000000000.TH SHEEPDOG 8 @DATE@ .SH NAME sheepfs \- A pseudo file system exports both Sheepdog's internal state as well as Sheepdog's storage .SH SYNOPSIS .B "sheepfs [OPTION]... MOUNTPOINT" .SH DESCRIPTION .B sheepfs - Sheepdog is a distributed storage system for QEMU. It provides highly available block level storage volumes to virtual machines. Sheepdog supports advanced volume management features such as snapshot, cloning, and thin provisioning. The architecture of Sheepdog is fully symmetric; there is no central node such as a meta-data server. The server daemon is called sheep(8). A command line utility is available via dog(8). A pseudo file system is available via sheepfs(8). QEMU virtual machines use the sheep daemon via a block driver available in qemu(1). Sheepfs is a FUSE-based pseudo file system in userland to access both Sheepdog's internal state (for e.g, cluster info, vdi list) as well as Sheepdog's high reliable storage. The idea here is that its sometimes useful that we can envision our interaction with an Sheepdog's object in terms of a directory structure and filesystem operations. People might be mostly interested into sheepfs's volume directory, which export VM's volume as a pseudo block file in your local file system hierarchy, which can be used as 1. 
a big file abstraction, which is actually backed by Sheepdog's storage, distributed in the cluster. 2. a loop device file, which you can mount wherever you want to use it as a file system backed up by Sheepdog. 3. a loop device file for some VM's image, which you want to access(RW) its internal data. 4. storage media for other hypervisor, such as XEN This file abstraction integrates well into kernel's pagecache. .SH OPTIONS @OPTIONS@ .SH DEPENDENCIES \fBSheepdog\fP requires QEMU 0.13.z or later and Corosync 1.y.z. .SH FILES none .SH SEE ALSO .BR sheep(8), .BR dog(8), .BR qemu(1), .BR corosync_overview(8) .SH AUTHORS This software is developed by the Sheepdog community which may be reached via mailing list at . .PP sheepdog-0.7.5/script/000077500000000000000000000000001223630776600146415ustar00rootroot00000000000000sheepdog-0.7.5/script/Makefile.am000066400000000000000000000014061223630776600166760ustar00rootroot00000000000000MAINTAINERCLEANFILES = Makefile.in EXTRA_DIST = sheepdog.in noinst_HEADERS = checkarch.sh vditest gen_man.pl gen_bash_completion.pl initscript_SCRIPTS = sheepdog initscriptdir = $(INITDDIR) completion_DATA = dog completiondir = $(sysconfdir)/bash_completion.d dog: gen_bash_completion.pl Makefile rm -f $@-t $@ $(top_srcdir)/script/gen_bash_completion.pl $(top_builddir)/dog/dog > $@-t mv $@-t $@ %: %.in Makefile rm -f $@-t $@ sed \ -e 's#@''SBINDIR@#$(sbindir)#g' \ -e 's#@''SYSCONFDIR@#$(sysconfdir)#g' \ -e 's#@''INITDDIR@#$(INITDDIR)#g' \ -e 's#@''LOCALSTATEDIR@#$(localstatedir)#g' \ $< > $@-t chmod 0755 $@-t mv $@-t $@ all-local: $(initscript_SCRIPTS) $(completion_DATA) clean-local: rm -rf $(initscript_SCRIPTS) $(completion_DATA) sheepdog-0.7.5/script/checkarch.sh000066400000000000000000000004721223630776600171130ustar00rootroot00000000000000#!/bin/sh arch=`gcc -dumpmachine` case $arch in `echo $arch | grep x86_64`) echo -D__SIZEOF_POINTER__=8 -m64 ;; `echo $arch | grep "i[3-6]86"`) echo -D__SIZEOF_POINTER__=4 -m32 ;; *) echo ' Failed to parse 
your architecture. Please run $ make check32 or $ make check64 manually. ' exit 1 ;; esac sheepdog-0.7.5/script/checkpatch.pl000077500000000000000000002322661223630776600173110ustar00rootroot00000000000000#!/usr/bin/perl -w # (c) 2001, Dave Jones. (the file handling bit) # (c) 2005, Joel Schopp (the ugly bit) # (c) 2007,2008, Andy Whitcroft (new conditions, test suite) # (c) 2008-2010 Andy Whitcroft # Licensed under the terms of the GNU GPL License version 2 use strict; my $P = $0; $P =~ s@.*/@@g; my $V = '0.32'; use Getopt::Long qw(:config no_auto_abbrev); my $quiet = 0; my $tree = 0; my $chk_signoff = 1; my $chk_patch = 1; my $tst_only; my $emacs = 0; my $terse = 0; my $file = 0; my $check = 0; my $summary = 1; my $mailback = 0; my $summary_file = 0; my $show_types = 0; my $root; my %debug; my %ignore_type = (); my @ignore = (); my $help = 0; my $configuration_file = ".checkpatch.conf"; sub help { my ($exitcode) = @_; print << "EOM"; Usage: $P [OPTION]... [FILE]... Version: $V Options: -q, --quiet quiet --no-tree run without a kernel tree --no-signoff do not check for 'Signed-off-by' line --patch treat FILE as patchfile (default) --emacs emacs compile window format --terse one line per report -f, --file treat FILE as regular source file --subjective, --strict enable more subjective tests --ignore TYPE(,TYPE2...) ignore various comma separated message types --show-types show the message "types" in the output --root=PATH PATH to the kernel tree root --no-summary suppress the per-file summary --mailback only produce a report in case of warnings/errors --summary-file include the filename in summary --debug KEY=[0|1] turn on/off debugging of KEY, where KEY is one of 'values', 'possible', 'type', and 'attr' (default is all off) --test-only=WORD report only warnings/errors containing WORD literally -h, --help, --version display this help and exit When FILE is - read standard input. 
EOM exit($exitcode); } my $conf = which_conf($configuration_file); if (-f $conf) { my @conf_args; open(my $conffile, '<', "$conf") or warn "$P: Can't find a readable $configuration_file file $!\n"; while (<$conffile>) { my $line = $_; $line =~ s/\s*\n?$//g; $line =~ s/^\s*//g; $line =~ s/\s+/ /g; next if ($line =~ m/^\s*#/); next if ($line =~ m/^\s*$/); my @words = split(" ", $line); foreach my $word (@words) { last if ($word =~ m/^#/); push (@conf_args, $word); } } close($conffile); unshift(@ARGV, @conf_args) if @conf_args; } GetOptions( 'q|quiet+' => \$quiet, 'tree!' => \$tree, 'signoff!' => \$chk_signoff, 'patch!' => \$chk_patch, 'emacs!' => \$emacs, 'terse!' => \$terse, 'f|file!' => \$file, 'subjective!' => \$check, 'strict!' => \$check, 'ignore=s' => \@ignore, 'show-types!' => \$show_types, 'root=s' => \$root, 'summary!' => \$summary, 'mailback!' => \$mailback, 'summary-file!' => \$summary_file, 'debug=s' => \%debug, 'test-only=s' => \$tst_only, 'h|help' => \$help, 'version' => \$help ) or help(1); help(0) if ($help); my $exit = 0; if ($#ARGV < 0) { print "$P: no input files\n"; exit(1); } @ignore = split(/,/, join(',',@ignore)); foreach my $word (@ignore) { $word =~ s/\s*\n?$//g; $word =~ s/^\s*//g; $word =~ s/\s+/ /g; $word =~ tr/[a-z]/[A-Z]/; next if ($word =~ m/^\s*#/); next if ($word =~ m/^\s*$/); $ignore_type{$word}++; } my $dbg_values = 0; my $dbg_possible = 0; my $dbg_type = 0; my $dbg_attr = 0; for my $key (keys %debug) { ## no critic eval "\${dbg_$key} = '$debug{$key}';"; die "$@" if ($@); } my $rpt_cleaners = 0; if ($terse) { $emacs = 1; $quiet++; } if ($tree) { if (defined $root) { if (!top_of_kernel_tree($root)) { die "$P: $root: --root does not point at a valid tree\n"; } } else { if (top_of_kernel_tree('.')) { $root = '.'; } elsif ($0 =~ m@(.*)/scripts/[^/]*$@ && top_of_kernel_tree($1)) { $root = $1; } } if (!defined $root) { print "Must be run from the top-level dir. 
of a kernel tree\n"; exit(2); } } my $emitted_corrupt = 0; our $Ident = qr{ [A-Za-z_][A-Za-z\d_]* (?:\s*\#\#\s*[A-Za-z_][A-Za-z\d_]*)* }x; our $Storage = qr{extern|static|asmlinkage}; our $Sparse = qr{ __user| __kernel| __force| __iomem| __must_check| __init_refok| __kprobes| __ref| __rcu }x; # Notes to $Attribute: # We need \b after 'init' otherwise 'initconst' will cause a false positive in a check our $Attribute = qr{ const| __percpu| __nocast| __safe| __bitwise__| __packed__| __packed2__| __naked| __maybe_unused| __always_unused| __noreturn| __used| __cold| __noclone| __deprecated| __read_mostly| __kprobes| __(?:mem|cpu|dev|)(?:initdata|initconst|init\b)| ____cacheline_aligned| ____cacheline_aligned_in_smp| ____cacheline_internodealigned_in_smp| __weak }x; our $Modifier; our $Inline = qr{inline|__always_inline|noinline}; our $Member = qr{->$Ident|\.$Ident|\[[^]]*\]}; our $Lval = qr{$Ident(?:$Member)*}; our $Constant = qr{(?i:(?:[0-9]+|0x[0-9a-f]+)[ul]*)}; our $Assignment = qr{(?:\*\=|/=|%=|\+=|-=|<<=|>>=|&=|\^=|\|=|=)}; our $Compare = qr{<=|>=|==|!=|<|>}; our $Operators = qr{ <=|>=|==|!=| =>|->|<<|>>|<|>|!|~| &&|\|\||,|\^|\+\+|--|&|\||\+|-|\*|\/|% }x; our $NonptrType; our $Type; our $Declare; our $NON_ASCII_UTF8 = qr{ [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3 | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15 | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 }x; our $UTF8 = qr{ [\x09\x0A\x0D\x20-\x7E] # ASCII | $NON_ASCII_UTF8 }x; our $typeTypedefs = qr{(?x: (?:__)?(?:u|s|be|le)(?:8|16|32|64)| atomic_t )}; our $logFunctions = qr{(?x: printk(?:_ratelimited|_once|)| [a-z0-9]+_(?:printk|emerg|alert|crit|err|warning|warn|notice|info|debug|dbg|vdbg|devel|cont|WARN)(?:_ratelimited|_once|)| WARN(?:_RATELIMIT|_ONCE|)| panic| MODULE_[A-Z_]+ )}; our $signature_tags = qr{(?xi: 
Signed-off-by:| Acked-by:| Tested-by:| Reviewed-by:| Reported-by:| To:| Cc: )}; our @typeList = ( qr{void}, qr{(?:unsigned\s+)?char}, qr{(?:unsigned\s+)?short}, qr{(?:unsigned\s+)?int}, qr{(?:unsigned\s+)?long}, qr{(?:unsigned\s+)?long\s+int}, qr{(?:unsigned\s+)?long\s+long}, qr{(?:unsigned\s+)?long\s+long\s+int}, qr{unsigned}, qr{float}, qr{double}, qr{bool}, qr{struct\s+$Ident}, qr{union\s+$Ident}, qr{enum\s+$Ident}, qr{${Ident}_t}, qr{${Ident}_handler}, qr{${Ident}_handler_fn}, ); our @modifierList = ( qr{fastcall}, ); our $allowed_asm_includes = qr{(?x: irq| memory )}; # memory.h: ARM has a custom one sub build_types { my $mods = "(?x: \n" . join("|\n ", @modifierList) . "\n)"; my $all = "(?x: \n" . join("|\n ", @typeList) . "\n)"; $Modifier = qr{(?:$Attribute|$Sparse|$mods)}; $NonptrType = qr{ (?:$Modifier\s+|const\s+)* (?: (?:typeof|__typeof__)\s*\([^\)]*\)| (?:$typeTypedefs\b)| (?:${all}\b) ) (?:\s+$Modifier|\s+const)* }x; $Type = qr{ $NonptrType (?:[\s\*]+\s*const|[\s\*]+|(?:\s*\[\s*\])+)? 
(?:\s+$Inline|\s+$Modifier)* }x; $Declare = qr{(?:$Storage\s+)?$Type}; } build_types(); our $match_balanced_parentheses = qr/(\((?:[^\(\)]+|(-1))*\))/; our $Typecast = qr{\s*(\(\s*$NonptrType\s*\)){0,1}\s*}; our $LvalOrFunc = qr{($Lval)\s*($match_balanced_parentheses{0,1})\s*}; our $FuncArg = qr{$Typecast{0,1}($LvalOrFunc|$Constant)}; sub deparenthesize { my ($string) = @_; return "" if (!defined($string)); $string =~ s@^\s*\(\s*@@g; $string =~ s@\s*\)\s*$@@g; $string =~ s@\s+@ @g; return $string; } $chk_signoff = 0 if ($file); my @rawlines = (); my @lines = (); my $vname; for my $filename (@ARGV) { my $FILE; if ($file) { open($FILE, '-|', "diff -u /dev/null $filename") || die "$P: $filename: diff failed - $!\n"; } elsif ($filename eq '-') { open($FILE, '<&STDIN'); } else { open($FILE, '<', "$filename") || die "$P: $filename: open failed - $!\n"; } if ($filename eq '-') { $vname = 'Your patch'; } else { $vname = $filename; } while (<$FILE>) { chomp; push(@rawlines, $_); } close($FILE); if (!process($filename)) { $exit = 1; } @rawlines = (); @lines = (); } exit($exit); sub top_of_kernel_tree { my ($root) = @_; my @tree_check = ( "COPYING", "CREDITS", "Kbuild", "MAINTAINERS", "Makefile", "README", "Documentation", "arch", "include", "drivers", "fs", "init", "ipc", "kernel", "lib", "scripts", ); foreach my $check (@tree_check) { if (! -e $root . '/' . 
$check) { return 0; } } return 1; } sub parse_email { my ($formatted_email) = @_; my $name = ""; my $address = ""; my $comment = ""; if ($formatted_email =~ /^(.*)<(\S+\@\S+)>(.*)$/) { $name = $1; $address = $2; $comment = $3 if defined $3; } elsif ($formatted_email =~ /^\s*<(\S+\@\S+)>(.*)$/) { $address = $1; $comment = $2 if defined $2; } elsif ($formatted_email =~ /(\S+\@\S+)(.*)$/) { $address = $1; $comment = $2 if defined $2; $formatted_email =~ s/$address.*$//; $name = $formatted_email; $name =~ s/^\s+|\s+$//g; $name =~ s/^\"|\"$//g; # If there's a name left after stripping spaces and # leading quotes, and the address doesn't have both # leading and trailing angle brackets, the address # is invalid. ie: # "joe smith joe@smith.com" bad # "joe smith ]+>$/) { $name = ""; $address = ""; $comment = ""; } } $name =~ s/^\s+|\s+$//g; $name =~ s/^\"|\"$//g; $address =~ s/^\s+|\s+$//g; $address =~ s/^\<|\>$//g; if ($name =~ /[^\w \-]/i) { ##has "must quote" chars $name =~ s/(?"; } return $formatted_email; } sub which_conf { my ($conf) = @_; foreach my $path (split(/:/, ".:$ENV{HOME}:.scripts")) { if (-e "$path/$conf") { return "$path/$conf"; } } return ""; } sub expand_tabs { my ($str) = @_; my $res = ''; my $n = 0; for my $c (split(//, $str)) { if ($c eq "\t") { $res .= ' '; $n++; for (; ($n % 8) != 0; $n++) { $res .= ' '; } next; } $res .= $c; $n++; } return $res; } sub copy_spacing { (my $res = shift) =~ tr/\t/ /c; return $res; } sub line_stats { my ($line) = @_; # Drop the diff line leader and expand tabs $line =~ s/^.//; $line = expand_tabs($line); # Pick the indent from the front of the line. my ($white) = ($line =~ /^(\s*)/); return (length($line), length($white)); } my $sanitise_quote = ''; sub sanitise_line_reset { my ($in_comment) = @_; if ($in_comment) { $sanitise_quote = '*/'; } else { $sanitise_quote = ''; } } sub sanitise_line { my ($line) = @_; my $res = ''; my $l = ''; my $qlen = 0; my $off = 0; my $c; # Always copy over the diff marker. 
$res = substr($line, 0, 1); for ($off = 1; $off < length($line); $off++) { $c = substr($line, $off, 1); # Comments we are wacking completly including the begin # and end, all to $;. if ($sanitise_quote eq '' && substr($line, $off, 2) eq '/*') { $sanitise_quote = '*/'; substr($res, $off, 2, "$;$;"); $off++; next; } if ($sanitise_quote eq '*/' && substr($line, $off, 2) eq '*/') { $sanitise_quote = ''; substr($res, $off, 2, "$;$;"); $off++; next; } if ($sanitise_quote eq '' && substr($line, $off, 2) eq '//') { $sanitise_quote = '//'; substr($res, $off, 2, $sanitise_quote); $off++; next; } # A \ in a string means ignore the next character. if (($sanitise_quote eq "'" || $sanitise_quote eq '"') && $c eq "\\") { substr($res, $off, 2, 'XX'); $off++; next; } # Regular quotes. if ($c eq "'" || $c eq '"') { if ($sanitise_quote eq '') { $sanitise_quote = $c; substr($res, $off, 1, $c); next; } elsif ($sanitise_quote eq $c) { $sanitise_quote = ''; } } #print "c<$c> SQ<$sanitise_quote>\n"; if ($off != 0 && $sanitise_quote eq '*/' && $c ne "\t") { substr($res, $off, 1, $;); } elsif ($off != 0 && $sanitise_quote eq '//' && $c ne "\t") { substr($res, $off, 1, $;); } elsif ($off != 0 && $sanitise_quote && $c ne "\t") { substr($res, $off, 1, 'X'); } else { substr($res, $off, 1, $c); } } if ($sanitise_quote eq '//') { $sanitise_quote = ''; } # The pathname on a #include may be surrounded by '<' and '>'. if ($res =~ /^.\s*\#\s*include\s+\<(.*)\>/) { my $clean = 'X' x length($1); $res =~ s@\<.*\>@<$clean>@; # The whole of a #error is a string. 
} elsif ($res =~ /^.\s*\#\s*(?:error|warning)\s+(.*)\b/) { my $clean = 'X' x length($1); $res =~ s@(\#\s*(?:error|warning)\s+).*@$1$clean@; } return $res; } sub ctx_statement_block { my ($linenr, $remain, $off) = @_; my $line = $linenr - 1; my $blk = ''; my $soff = $off; my $coff = $off - 1; my $coff_set = 0; my $loff = 0; my $type = ''; my $level = 0; my @stack = (); my $p; my $c; my $len = 0; my $remainder; while (1) { @stack = (['', 0]) if ($#stack == -1); #warn "CSB: blk<$blk> remain<$remain>\n"; # If we are about to drop off the end, pull in more # context. if ($off >= $len) { for (; $remain > 0; $line++) { last if (!defined $lines[$line]); next if ($lines[$line] =~ /^-/); $remain--; $loff = $len; $blk .= $lines[$line] . "\n"; $len = length($blk); $line++; last; } # Bail if there is no further context. #warn "CSB: blk<$blk> off<$off> len<$len>\n"; if ($off >= $len) { last; } if ($level == 0 && substr($blk, $off) =~ /^.\s*#\s*define/) { $level++; $type = '#'; } } $p = $c; $c = substr($blk, $off, 1); $remainder = substr($blk, $off); #warn "CSB: c<$c> type<$type> level<$level> remainder<$remainder> coff_set<$coff_set>\n"; # Handle nested #if/#else. if ($remainder =~ /^#\s*(?:ifndef|ifdef|if)\s/) { push(@stack, [ $type, $level ]); } elsif ($remainder =~ /^#\s*(?:else|elif)\b/) { ($type, $level) = @{$stack[$#stack - 1]}; } elsif ($remainder =~ /^#\s*endif\b/) { ($type, $level) = @{pop(@stack)}; } # Statement ends at the ';' or a close '}' at the # outermost level. if ($level == 0 && $c eq ';') { last; } # An else is really a conditional as long as its not else if if ($level == 0 && $coff_set == 0 && (!defined($p) || $p =~ /(?:\s|\}|\+)/) && $remainder =~ /^(else)(?:\s|{)/ && $remainder !~ /^else\s+if\b/) { $coff = $off + length($1) - 1; $coff_set = 1; #warn "CSB: mark coff<$coff> soff<$soff> 1<$1>\n"; #warn "[" . substr($blk, $soff, $coff - $soff + 1) . 
"]\n"; } if (($type eq '' || $type eq '(') && $c eq '(') { $level++; $type = '('; } if ($type eq '(' && $c eq ')') { $level--; $type = ($level != 0)? '(' : ''; if ($level == 0 && $coff < $soff) { $coff = $off; $coff_set = 1; #warn "CSB: mark coff<$coff>\n"; } } if (($type eq '' || $type eq '{') && $c eq '{') { $level++; $type = '{'; } if ($type eq '{' && $c eq '}') { $level--; $type = ($level != 0)? '{' : ''; if ($level == 0) { if (substr($blk, $off + 1, 1) eq ';') { $off++; } last; } } # Preprocessor commands end at the newline unless escaped. if ($type eq '#' && $c eq "\n" && $p ne "\\") { $level--; $type = ''; $off++; last; } $off++; } # We are truly at the end, so shuffle to the next line. if ($off == $len) { $loff = $len + 1; $line++; $remain--; } my $statement = substr($blk, $soff, $off - $soff + 1); my $condition = substr($blk, $soff, $coff - $soff + 1); #warn "STATEMENT<$statement>\n"; #warn "CONDITION<$condition>\n"; #print "coff<$coff> soff<$off> loff<$loff>\n"; return ($statement, $condition, $line, $remain + 1, $off - $loff + 1, $level); } sub statement_lines { my ($stmt) = @_; # Strip the diff line prefixes and rip blank lines at start and end. $stmt =~ s/(^|\n)./$1/g; $stmt =~ s/^\s*//; $stmt =~ s/\s*$//; my @stmt_lines = ($stmt =~ /\n/g); return $#stmt_lines + 2; } sub statement_rawlines { my ($stmt) = @_; my @stmt_lines = ($stmt =~ /\n/g); return $#stmt_lines + 2; } sub statement_block_size { my ($stmt) = @_; $stmt =~ s/(^|\n)./$1/g; $stmt =~ s/^\s*{//; $stmt =~ s/}\s*$//; $stmt =~ s/^\s*//; $stmt =~ s/\s*$//; my @stmt_lines = ($stmt =~ /\n/g); my @stmt_statements = ($stmt =~ /;/g); my $stmt_lines = $#stmt_lines + 2; my $stmt_statements = $#stmt_statements + 1; if ($stmt_lines > $stmt_statements) { return $stmt_lines; } else { return $stmt_statements; } } sub ctx_statement_full { my ($linenr, $remain, $off) = @_; my ($statement, $condition, $level); my (@chunks); # Grab the first conditional/block pair. 
($statement, $condition, $linenr, $remain, $off, $level) = ctx_statement_block($linenr, $remain, $off); #print "F: c<$condition> s<$statement> remain<$remain>\n"; push(@chunks, [ $condition, $statement ]); if (!($remain > 0 && $condition =~ /^\s*(?:\n[+-])?\s*(?:if|else|do)\b/s)) { return ($level, $linenr, @chunks); } # Pull in the following conditional/block pairs and see if they # could continue the statement. for (;;) { ($statement, $condition, $linenr, $remain, $off, $level) = ctx_statement_block($linenr, $remain, $off); #print "C: c<$condition> s<$statement> remain<$remain>\n"; last if (!($remain > 0 && $condition =~ /^(?:\s*\n[+-])*\s*(?:else|do)\b/s)); #print "C: push\n"; push(@chunks, [ $condition, $statement ]); } return ($level, $linenr, @chunks); } sub ctx_block_get { my ($linenr, $remain, $outer, $open, $close, $off) = @_; my $line; my $start = $linenr - 1; my $blk = ''; my @o; my @c; my @res = (); my $level = 0; my @stack = ($level); for ($line = $start; $remain > 0; $line++) { next if ($rawlines[$line] =~ /^-/); $remain--; $blk .= $rawlines[$line]; # Handle nested #if/#else. 
if ($lines[$line] =~ /^.\s*#\s*(?:ifndef|ifdef|if)\s/) { push(@stack, $level); } elsif ($lines[$line] =~ /^.\s*#\s*(?:else|elif)\b/) { $level = $stack[$#stack - 1]; } elsif ($lines[$line] =~ /^.\s*#\s*endif\b/) { $level = pop(@stack); } foreach my $c (split(//, $lines[$line])) { ##print "C<$c>L<$level><$open$close>O<$off>\n"; if ($off > 0) { $off--; next; } if ($c eq $close && $level > 0) { $level--; last if ($level == 0); } elsif ($c eq $open) { $level++; } } if (!$outer || $level <= 1) { push(@res, $rawlines[$line]); } last if ($level == 0); } return ($level, @res); } sub ctx_block_outer { my ($linenr, $remain) = @_; my ($level, @r) = ctx_block_get($linenr, $remain, 1, '{', '}', 0); return @r; } sub ctx_block { my ($linenr, $remain) = @_; my ($level, @r) = ctx_block_get($linenr, $remain, 0, '{', '}', 0); return @r; } sub ctx_statement { my ($linenr, $remain, $off) = @_; my ($level, @r) = ctx_block_get($linenr, $remain, 0, '(', ')', $off); return @r; } sub ctx_block_level { my ($linenr, $remain) = @_; return ctx_block_get($linenr, $remain, 0, '{', '}', 0); } sub ctx_statement_level { my ($linenr, $remain, $off) = @_; return ctx_block_get($linenr, $remain, 0, '(', ')', $off); } sub ctx_locate_comment { my ($first_line, $end_line) = @_; # Catch a comment on the end of the line itself. my ($current_comment) = ($rawlines[$end_line - 1] =~ m@.*(/\*.*\*/)\s*(?:\\\s*)?$@); return $current_comment if (defined $current_comment); # Look through the context and try and figure out if there is a # comment. my $in_comment = 0; $current_comment = ''; for (my $linenr = $first_line; $linenr < $end_line; $linenr++) { my $line = $rawlines[$linenr - 1]; #warn " $line\n"; if ($linenr == $first_line and $line =~ m@^.\s*\*@) { $in_comment = 1; } if ($line =~ m@/\*@) { $in_comment = 1; } if (!$in_comment && $current_comment ne '') { $current_comment = ''; } $current_comment .= $line . 
"\n" if ($in_comment); if ($line =~ m@\*/@) { $in_comment = 0; } } chomp($current_comment); return($current_comment); } sub ctx_has_comment { my ($first_line, $end_line) = @_; my $cmt = ctx_locate_comment($first_line, $end_line); ##print "LINE: $rawlines[$end_line - 1 ]\n"; ##print "CMMT: $cmt\n"; return ($cmt ne ''); } sub raw_line { my ($linenr, $cnt) = @_; my $offset = $linenr - 1; $cnt++; my $line; while ($cnt) { $line = $rawlines[$offset++]; next if (defined($line) && $line =~ /^-/); $cnt--; } return $line; } sub cat_vet { my ($vet) = @_; my ($res, $coded); $res = ''; while ($vet =~ /([^[:cntrl:]]*)([[:cntrl:]]|$)/g) { $res .= $1; if ($2 ne '') { $coded = sprintf("^%c", unpack('C', $2) + 64); $res .= $coded; } } $res =~ s/$/\$/; return $res; } my $av_preprocessor = 0; my $av_pending; my @av_paren_type; my $av_pend_colon; sub annotate_reset { $av_preprocessor = 0; $av_pending = '_'; @av_paren_type = ('E'); $av_pend_colon = 'O'; } sub annotate_values { my ($stream, $type) = @_; my $res; my $var = '_' x length($stream); my $cur = $stream; print "$stream\n" if ($dbg_values > 1); while (length($cur)) { @av_paren_type = ('E') if ($#av_paren_type < 0); print " <" . join('', @av_paren_type) . 
"> <$type> <$av_pending>" if ($dbg_values > 1); if ($cur =~ /^(\s+)/o) { print "WS($1)\n" if ($dbg_values > 1); if ($1 =~ /\n/ && $av_preprocessor) { $type = pop(@av_paren_type); $av_preprocessor = 0; } } elsif ($cur =~ /^(\(\s*$Type\s*)\)/ && $av_pending eq '_') { print "CAST($1)\n" if ($dbg_values > 1); push(@av_paren_type, $type); $type = 'c'; } elsif ($cur =~ /^($Type)\s*(?:$Ident|,|\)|\(|\s*$)/) { print "DECLARE($1)\n" if ($dbg_values > 1); $type = 'T'; } elsif ($cur =~ /^($Modifier)\s*/) { print "MODIFIER($1)\n" if ($dbg_values > 1); $type = 'T'; } elsif ($cur =~ /^(\#\s*define\s*$Ident)(\(?)/o) { print "DEFINE($1,$2)\n" if ($dbg_values > 1); $av_preprocessor = 1; push(@av_paren_type, $type); if ($2 ne '') { $av_pending = 'N'; } $type = 'E'; } elsif ($cur =~ /^(\#\s*(?:undef\s*$Ident|include\b))/o) { print "UNDEF($1)\n" if ($dbg_values > 1); $av_preprocessor = 1; push(@av_paren_type, $type); } elsif ($cur =~ /^(\#\s*(?:ifdef|ifndef|if))/o) { print "PRE_START($1)\n" if ($dbg_values > 1); $av_preprocessor = 1; push(@av_paren_type, $type); push(@av_paren_type, $type); $type = 'E'; } elsif ($cur =~ /^(\#\s*(?:else|elif))/o) { print "PRE_RESTART($1)\n" if ($dbg_values > 1); $av_preprocessor = 1; push(@av_paren_type, $av_paren_type[$#av_paren_type]); $type = 'E'; } elsif ($cur =~ /^(\#\s*(?:endif))/o) { print "PRE_END($1)\n" if ($dbg_values > 1); $av_preprocessor = 1; # Assume all arms of the conditional end as this # one does, and continue as if the #endif was not here. 
pop(@av_paren_type); push(@av_paren_type, $type); $type = 'E'; } elsif ($cur =~ /^(\\\n)/o) { print "PRECONT($1)\n" if ($dbg_values > 1); } elsif ($cur =~ /^(__attribute__)\s*\(?/o) { print "ATTR($1)\n" if ($dbg_values > 1); $av_pending = $type; $type = 'N'; } elsif ($cur =~ /^(sizeof)\s*(\()?/o) { print "SIZEOF($1)\n" if ($dbg_values > 1); if (defined $2) { $av_pending = 'V'; } $type = 'N'; } elsif ($cur =~ /^(if|while|for)\b/o) { print "COND($1)\n" if ($dbg_values > 1); $av_pending = 'E'; $type = 'N'; } elsif ($cur =~/^(case)/o) { print "CASE($1)\n" if ($dbg_values > 1); $av_pend_colon = 'C'; $type = 'N'; } elsif ($cur =~/^(return|else|goto|typeof|__typeof__)\b/o) { print "KEYWORD($1)\n" if ($dbg_values > 1); $type = 'N'; } elsif ($cur =~ /^(\()/o) { print "PAREN('$1')\n" if ($dbg_values > 1); push(@av_paren_type, $av_pending); $av_pending = '_'; $type = 'N'; } elsif ($cur =~ /^(\))/o) { my $new_type = pop(@av_paren_type); if ($new_type ne '_') { $type = $new_type; print "PAREN('$1') -> $type\n" if ($dbg_values > 1); } else { print "PAREN('$1')\n" if ($dbg_values > 1); } } elsif ($cur =~ /^($Ident)\s*\(/o) { print "FUNC($1)\n" if ($dbg_values > 1); $type = 'V'; $av_pending = 'V'; } elsif ($cur =~ /^($Ident\s*):(?:\s*\d+\s*(,|=|;))?/) { if (defined $2 && $type eq 'C' || $type eq 'T') { $av_pend_colon = 'B'; } elsif ($type eq 'E') { $av_pend_colon = 'L'; } print "IDENT_COLON($1,$type>$av_pend_colon)\n" if ($dbg_values > 1); $type = 'V'; } elsif ($cur =~ /^($Ident|$Constant)/o) { print "IDENT($1)\n" if ($dbg_values > 1); $type = 'V'; } elsif ($cur =~ /^($Assignment)/o) { print "ASSIGN($1)\n" if ($dbg_values > 1); $type = 'N'; } elsif ($cur =~/^(;|{|})/) { print "END($1)\n" if ($dbg_values > 1); $type = 'E'; $av_pend_colon = 'O'; } elsif ($cur =~/^(,)/) { print "COMMA($1)\n" if ($dbg_values > 1); $type = 'C'; } elsif ($cur =~ /^(\?)/o) { print "QUESTION($1)\n" if ($dbg_values > 1); $type = 'N'; } elsif ($cur =~ /^(:)/o) { print "COLON($1,$av_pend_colon)\n" if 
($dbg_values > 1); substr($var, length($res), 1, $av_pend_colon); if ($av_pend_colon eq 'C' || $av_pend_colon eq 'L') { $type = 'E'; } else { $type = 'N'; } $av_pend_colon = 'O'; } elsif ($cur =~ /^(\[)/o) { print "CLOSE($1)\n" if ($dbg_values > 1); $type = 'N'; } elsif ($cur =~ /^(-(?![->])|\+(?!\+)|\*|\&\&|\&)/o) { my $variant; print "OPV($1)\n" if ($dbg_values > 1); if ($type eq 'V') { $variant = 'B'; } else { $variant = 'U'; } substr($var, length($res), 1, $variant); $type = 'N'; } elsif ($cur =~ /^($Operators)/o) { print "OP($1)\n" if ($dbg_values > 1); if ($1 ne '++' && $1 ne '--') { $type = 'N'; } } elsif ($cur =~ /(^.)/o) { print "C($1)\n" if ($dbg_values > 1); } if (defined $1) { $cur = substr($cur, length($1)); $res .= $type x length($1); } } return ($res, $var); } sub possible { my ($possible, $line) = @_; my $notPermitted = qr{(?: ^(?: $Modifier| $Storage| $Type| DEFINE_\S+ )$| ^(?: goto| return| case| else| asm|__asm__| do| \#| \#\#| )(?:\s|$)| ^(?:typedef|struct|enum)\b )}x; warn "CHECK<$possible> ($line)\n" if ($dbg_possible > 2); if ($possible !~ $notPermitted) { # Check for modifiers. $possible =~ s/\s*$Storage\s*//g; $possible =~ s/\s*$Sparse\s*//g; if ($possible =~ /^\s*$/) { } elsif ($possible =~ /\s/) { $possible =~ s/\s*$Type\s*//g; for my $modifier (split(' ', $possible)) { if ($modifier !~ $notPermitted) { warn "MODIFIER: $modifier ($possible) ($line)\n" if ($dbg_possible); push(@modifierList, $modifier); } } } else { warn "POSSIBLE: $possible ($line)\n" if ($dbg_possible); push(@typeList, $possible); } build_types(); } else { warn "NOTPOSS: $possible ($line)\n" if ($dbg_possible > 1); } } my $prefix = ''; sub show_type { return !defined $ignore_type{$_[0]}; } sub report { if (!show_type($_[1]) || (defined $tst_only && $_[2] !~ /\Q$tst_only\E/)) { return 0; } my $line; if ($show_types) { $line = "$prefix$_[0]:$_[1]: $_[2]\n"; } else { $line = "$prefix$_[0]: $_[2]\n"; } $line = (split('\n', $line))[0] . 
"\n" if ($terse); push(our @report, $line); return 1; } sub report_dump { our @report; } sub ERROR { if (report("ERROR", $_[0], $_[1])) { our $clean = 0; our $cnt_error++; } } sub WARN { if (report("WARNING", $_[0], $_[1])) { our $clean = 0; our $cnt_warn++; } } sub CHK { if ($check && report("CHECK", $_[0], $_[1])) { our $clean = 0; our $cnt_chk++; } } sub check_absolute_file { my ($absolute, $herecurr) = @_; my $file = $absolute; ##print "absolute<$absolute>\n"; # See if any suffix of this path is a path within the tree. while ($file =~ s@^[^/]*/@@) { if (-f "$root/$file") { ##print "file<$file>\n"; last; } } if (! -f _) { return 0; } # It is, so see if the prefix is acceptable. my $prefix = $absolute; substr($prefix, -length($file)) = ''; ##print "prefix<$prefix>\n"; if ($prefix ne ".../") { WARN("USE_RELATIVE_PATH", "use relative pathname instead of absolute in changelog text\n" . $herecurr); } } sub process { my $filename = shift; my $linenr=0; my $prevline=""; my $prevrawline=""; my $stashline=""; my $stashrawline=""; my $length; my $indent; my $previndent=0; my $stashindent=0; our $clean = 1; my $signoff = 0; my $is_patch = 0; my $in_header_lines = 1; my $in_commit_log = 0; #Scanning lines before patch our @report = (); our $cnt_lines = 0; our $cnt_error = 0; our $cnt_warn = 0; our $cnt_chk = 0; # Trace the real file/line as we go. my $realfile = ''; my $realline = 0; my $realcnt = 0; my $here = ''; my $in_comment = 0; my $comment_edge = 0; my $first_line = 0; my $p1_prefix = ''; my $prev_values = 'E'; # suppression flags my %suppress_ifbraces; my %suppress_whiletrailers; my %suppress_export; my $suppress_statement = 0; # Pre-scan the patch sanitizing the lines. # Pre-scan the patch looking for any __setup documentation. 
# my @setup_docs = (); my $setup_docs = 0; sanitise_line_reset(); my $line; foreach my $rawline (@rawlines) { $linenr++; $line = $rawline; if ($rawline=~/^\+\+\+\s+(\S+)/) { $setup_docs = 0; if ($1 =~ m@Documentation/kernel-parameters.txt$@) { $setup_docs = 1; } #next; } if ($rawline=~/^\@\@ -\d+(?:,\d+)? \+(\d+)(,(\d+))? \@\@/) { $realline=$1-1; if (defined $2) { $realcnt=$3+1; } else { $realcnt=1+1; } $in_comment = 0; # Guestimate if this is a continuing comment. Run # the context looking for a comment "edge". If this # edge is a close comment then we must be in a comment # at context start. my $edge; my $cnt = $realcnt; for (my $ln = $linenr + 1; $cnt > 0; $ln++) { next if (defined $rawlines[$ln - 1] && $rawlines[$ln - 1] =~ /^-/); $cnt--; #print "RAW<$rawlines[$ln - 1]>\n"; last if (!defined $rawlines[$ln - 1]); if ($rawlines[$ln - 1] =~ m@(/\*|\*/)@ && $rawlines[$ln - 1] !~ m@"[^"]*(?:/\*|\*/)[^"]*"@) { ($edge) = $1; last; } } if (defined $edge && $edge eq '*/') { $in_comment = 1; } # Guestimate if this is a continuing comment. If this # is the start of a diff block and this line starts # ' *' then it is very likely a comment. if (!defined $edge && $rawlines[$linenr] =~ m@^.\s*(?:\*\*+| \*)(?:\s|$)@) { $in_comment = 1; } ##print "COMMENT:$in_comment edge<$edge> $rawline\n"; sanitise_line_reset($in_comment); } elsif ($realcnt && $rawline =~ /^(?:\+| |$)/) { # Standardise the strings and chars within the input to # simplify matching -- only bother with positive lines. $line = sanitise_line($rawline); } push(@lines, $line); if ($realcnt > 1) { $realcnt-- if ($line =~ /^(?:\+| |$)/); } else { $realcnt = 0; } #print "==>$rawline\n"; #print "-->$line\n"; if ($setup_docs && $line =~ /^\+/) { push(@setup_docs, $line); } } $prefix = ''; $realcnt = 0; $linenr = 0; foreach my $line (@lines) { $linenr++; my $rawline = $rawlines[$linenr - 1]; #extract the line range in the file after the patch is applied if ($line=~/^\@\@ -\d+(?:,\d+)? \+(\d+)(,(\d+))? 
\@\@/) { $is_patch = 1; $first_line = $linenr + 1; $realline=$1-1; if (defined $2) { $realcnt=$3+1; } else { $realcnt=1+1; } annotate_reset(); $prev_values = 'E'; %suppress_ifbraces = (); %suppress_whiletrailers = (); %suppress_export = (); $suppress_statement = 0; next; # track the line number as we move through the hunk, note that # new versions of GNU diff omit the leading space on completely # blank context lines so we need to count that too. } elsif ($line =~ /^( |\+|$)/) { $realline++; $realcnt-- if ($realcnt != 0); # Measure the line length and indent. ($length, $indent) = line_stats($rawline); # Track the previous line. ($prevline, $stashline) = ($stashline, $line); ($previndent, $stashindent) = ($stashindent, $indent); ($prevrawline, $stashrawline) = ($stashrawline, $rawline); #warn "line<$line>\n"; } elsif ($realcnt == 1) { $realcnt--; } my $hunk_line = ($realcnt != 0); #make up the handle for any error we report on this line $prefix = "$filename:$realline: " if ($emacs && $file); $prefix = "$filename:$linenr: " if ($emacs && !$file); $here = "#$linenr: " if (!$file); $here = "#$realline: " if ($file); # extract the filename as it passes if ($line =~ /^diff --git.*?(\S+)$/) { $realfile = $1; $realfile =~ s@^([^/]*)/@@; $in_commit_log = 0; } elsif ($line =~ /^\+\+\+\s+(\S+)/) { $realfile = $1; $realfile =~ s@^([^/]*)/@@; $in_commit_log = 0; $p1_prefix = $1; if (!$file && $tree && $p1_prefix ne '' && -e "$root/$p1_prefix") { WARN("PATCH_PREFIX", "patch prefix '$p1_prefix' exists, appears to be a -p0 patch\n"); } if ($realfile =~ m@^include/asm/@) { ERROR("MODIFIED_INCLUDE_ASM", "do not modify files in include/asm, change architecture specific files in include/asm-\n" . 
"$here$rawline\n"); } next; } $here .= "FILE: $realfile:$realline:" if ($realcnt != 0); my $hereline = "$here\n$rawline\n"; my $herecurr = "$here\n$rawline\n"; my $hereprev = "$here\n$prevrawline\n$rawline\n"; $cnt_lines++ if ($realcnt != 0); # Check for incorrect file permissions if ($line =~ /^new (file )?mode.*[7531]\d{0,2}$/) { my $permhere = $here . "FILE: $realfile\n"; if ($realfile =~ /(Makefile|Kconfig|\.c|\.h|\.S|\.tmpl)$/) { ERROR("EXECUTE_PERMISSIONS", "do not set execute permissions for source files\n" . $permhere); } } # Check the patch for a signoff: if ($line =~ /^\s*signed-off-by:/i) { $signoff++; $in_commit_log = 0; } # Check signature styles if (!$in_header_lines && $line =~ /^(\s*)($signature_tags)(\s*)(.*)/) { my $space_before = $1; my $sign_off = $2; my $space_after = $3; my $email = $4; my $ucfirst_sign_off = ucfirst(lc($sign_off)); if (defined $space_before && $space_before ne "") { WARN("BAD_SIGN_OFF", "Do not use whitespace before $ucfirst_sign_off\n" . $herecurr); } if ($sign_off =~ /-by:$/i && $sign_off ne $ucfirst_sign_off) { WARN("BAD_SIGN_OFF", "'$ucfirst_sign_off' is the preferred signature form\n" . $herecurr); } if (!defined $space_after || $space_after ne " ") { WARN("BAD_SIGN_OFF", "Use a single space after $ucfirst_sign_off\n" . $herecurr); } my ($email_name, $email_address, $comment) = parse_email($email); my $suggested_email = format_email(($email_name, $email_address)); if ($suggested_email eq "") { ERROR("BAD_SIGN_OFF", "Unrecognized email address: '$email'\n" . $herecurr); } else { my $dequoted = $suggested_email; $dequoted =~ s/^"//; $dequoted =~ s/" $comment" ne $email && "$suggested_email$comment" ne $email) { WARN("BAD_SIGN_OFF", "email address '$email' might be better as '$suggested_email$comment'\n" . $herecurr); } } } # Check for wrappage within a valid hunk of the file if ($realcnt != 0 && $line !~ m{^(?:\+|-| |\\ No newline|$)}) { ERROR("CORRUPTED_PATCH", "patch seems to be corrupt (line wrapped?)\n" . 
$herecurr) if (!$emitted_corrupt++); } # Check for absolute kernel paths. if ($tree) { while ($line =~ m{(?:^|\s)(/\S*)}g) { my $file = $1; if ($file =~ m{^(.*?)(?::\d+)+:?$} && check_absolute_file($1, $herecurr)) { # } else { check_absolute_file($file, $herecurr); } } } # UTF-8 regex found at http://www.w3.org/International/questions/qa-forms-utf-8.en.php if (($realfile =~ /^$/ || $line =~ /^\+/) && $rawline !~ m/^$UTF8*$/) { my ($utf8_prefix) = ($rawline =~ /^($UTF8*)/); my $blank = copy_spacing($rawline); my $ptr = substr($blank, 0, length($utf8_prefix)) . "^"; my $hereptr = "$hereline$ptr\n"; CHK("INVALID_UTF8", "Invalid UTF-8, patch and commit message should be encoded in UTF-8\n" . $hereptr); } # Check if it's the start of a commit log # (not a header line and we haven't seen the patch filename) if ($in_header_lines && $realfile =~ /^$/ && $rawline !~ /^(commit\b|from\b|[\w-]+:).+$/i) { $in_header_lines = 0; $in_commit_log = 1; } # Still not yet in a patch, check for any UTF-8 if ($in_commit_log && $realfile =~ /^$/ && $rawline =~ /$NON_ASCII_UTF8/) { CHK("UTF8_BEFORE_PATCH", "8-bit UTF-8 used in possible commit log\n" . $herecurr); } # ignore non-hunk lines and lines being removed next if (!$hunk_line || $line =~ /^-/); #trailing whitespace if ($line =~ /^\+.*\015/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; ERROR("DOS_LINE_ENDINGS", "DOS line endings\n" . $herevet); } elsif ($rawline =~ /^\+.*\S\s+$/ || $rawline =~ /^\+\s+$/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; ERROR("TRAILING_WHITESPACE", "trailing whitespace\n" . 
$herevet); $rpt_cleaners = 1; } # check we are in a valid source file if not then ignore this hunk next if ($realfile !~ /\.(h|c|s|S|pl|sh)$/); #80 column limit if ($line =~ /^\+/ && $prevrawline !~ /\/\*\*/ && $rawline !~ /^.\s*\*\s*\@$Ident\s/ && !($line =~ /^\+\s*$logFunctions\s*\(\s*(?:(KERN_\S+\s*|[^"]*))?"[X\t]*"\s*(?:|,|\)\s*;)\s*$/ || $line =~ /^\+\s*"[^"]*"\s*(?:\s*|,|\)\s*;)\s*$/) && $length > 80) { WARN("LONG_LINE", "line over 80 characters\n" . $herecurr); } # check for spaces before a quoted newline if ($rawline =~ /^.*\".*\s\\n/) { WARN("QUOTED_WHITESPACE_BEFORE_NEWLINE", "unnecessary whitespace before a quoted newline\n" . $herecurr); } # check for adding lines without a newline. if ($line =~ /^\+/ && defined $lines[$linenr] && $lines[$linenr] =~ /^\\ No newline at end of file/) { WARN("MISSING_EOF_NEWLINE", "adding a line without newline at end of file\n" . $herecurr); } # check we are in a valid source file C or perl if not then ignore this hunk next if ($realfile !~ /\.(h|c|pl)$/); # at the beginning of a line any tabs must come first and anything # more than 8 must use tabs. if ($rawline =~ /^\+\s* \t\s*\S/ || $rawline =~ /^\+\s* \s*/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; ERROR("CODE_INDENT", "code indent should use tabs where possible\n" . $herevet); $rpt_cleaners = 1; } # check for space before tabs. if ($rawline =~ /^\+/ && $rawline =~ / \t/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; WARN("SPACE_BEFORE_TAB", "please, no space before tabs\n" . $herevet); } # check for block comment. # # A: # /* foo # * bar */ # B: # /* # * foo # * bar */ # C: # /* # * one-liner # */ # D: # /* one-liner # */ # E: # /* foo # * bar # * baz # */ # above is not preferred # # /* # * This block comments style # * is preferred # */ if ($line =~ /^\+/ && $rawline =~ /\*\/$/ && $rawline !~ /\/\*/) { if ($rawline !~ /^\+\s*\*\/$/) { # case A and B WARN("BLOCK_COMMENT_STYLE", "[BCS] put the trailing */ on a separate line\n" . 
$hereprev); } elsif ($prevrawline =~ /^\+\s*\/\*/ || $rawlines[$linenr - 3] =~ /^\+\s*\/\*/) { # case C and D WARN("BLOCK_COMMENT_STYLE", "[BCS] don't use block comments for one liner comment\n" . $hereprev); } else { # case E my $ln = $linenr; while ($rawlines[$ln] !~ /^\+\s*\/\*/ && $ln >= 0) { $ln--; } if ($rawlines[$ln] =~ /^\+\s*\/\*./) { WARN("BLOCK_COMMENT_STYLE", "[BCS] don't comment at first line in block comments\n" . $hereprev); } } } # check for spaces at the beginning of a line. # Exceptions: # 1) within comments # 2) indented preprocessor commands # 3) hanging labels if ($rawline =~ /^\+ / && $line !~ /\+ *(?:$;|#|$Ident:)/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; WARN("LEADING_SPACE", "please, no spaces at the start of a line\n" . $herevet); } # check we are in a valid C source file if not then ignore this hunk next if ($realfile !~ /\.(h|c)$/); # check for RCS/CVS revision markers if ($rawline =~ /^\+.*\$(Revision|Log|Id)(?:\$|)/) { WARN("CVS_KEYWORD", "CVS style keyword markers, these will _not_ be updated\n". $herecurr); } # Check for potential 'bare' types my ($stat, $cond, $line_nr_next, $remain_next, $off_next, $realline_next); #print "LINE<$line>\n"; if ($linenr >= $suppress_statement && $realcnt && $line =~ /.\s*\S/) { ($stat, $cond, $line_nr_next, $remain_next, $off_next) = ctx_statement_block($linenr, $realcnt, 0); $stat =~ s/\n./\n /g; $cond =~ s/\n./\n /g; #print "linenr<$linenr> <$stat>\n"; # If this statement has no statement boundaries within # it there is no point in retrying a statement scan # until we hit end of it. my $frag = $stat; $frag =~ s/;+\s*$//; if ($frag !~ /(?:{|;)/) { #print "skip<$line_nr_next>\n"; $suppress_statement = $line_nr_next; } # Find the real next line. $realline_next = $line_nr_next; if (defined $realline_next && (!defined $lines[$realline_next - 1] || substr($lines[$realline_next - 1], $off_next) =~ /^\s*$/)) { $realline_next++; } my $s = $stat; $s =~ s/{.*$//s; # Ignore goto labels. 
if ($s =~ /$Ident:\*$/s) { # Ignore functions being called } elsif ($s =~ /^.\s*$Ident\s*\(/s) { } elsif ($s =~ /^.\s*else\b/s) { # declarations always start with types } elsif ($prev_values eq 'E' && $s =~ /^.\s*(?:$Storage\s+)?(?:$Inline\s+)?(?:const\s+)?((?:\s*$Ident)+?)\b(?:\s+$Sparse)?\s*\**\s*(?:$Ident|\(\*[^\)]*\))(?:\s*$Modifier)?\s*(?:;|=|,|\()/s) { my $type = $1; $type =~ s/\s+/ /g; possible($type, "A:" . $s); # definitions in global scope can only start with types } elsif ($s =~ /^.(?:$Storage\s+)?(?:$Inline\s+)?(?:const\s+)?($Ident)\b\s*(?!:)/s) { possible($1, "B:" . $s); } # any (foo ... *) is a pointer cast, and foo is a type while ($s =~ /\(($Ident)(?:\s+$Sparse)*[\s\*]+\s*\)/sg) { possible($1, "C:" . $s); } # Check for any sort of function declaration. # int foo(something bar, other baz); # void (*store_gdt)(x86_descr_ptr *); if ($prev_values eq 'E' && $s =~ /^(.(?:typedef\s*)?(?:(?:$Storage|$Inline)\s*)*\s*$Type\s*(?:\b$Ident|\(\*\s*$Ident\))\s*)\(/s) { my ($name_len) = length($1); my $ctx = $s; substr($ctx, 0, $name_len + 1, ''); $ctx =~ s/\)[^\)]*$//; for my $arg (split(/\s*,\s*/, $ctx)) { if ($arg =~ /^(?:const\s+)?($Ident)(?:\s+$Sparse)*\s*\**\s*(:?\b$Ident)?$/s || $arg =~ /^($Ident)$/s) { possible($1, "D:" . $s); } } } } # # Checks which may be anchored in the context. # # Check for switch () and associated case and default # statements should be at the same indent. 
if ($line=~/\bswitch\s*\(.*\)/) { my $err = ''; my $sep = ''; my @ctx = ctx_block_outer($linenr, $realcnt); shift(@ctx); for my $ctx (@ctx) { my ($clen, $cindent) = line_stats($ctx); if ($ctx =~ /^\+\s*(case\s+|default:)/ && $indent != $cindent) { $err .= "$sep$ctx\n"; $sep = ''; } else { $sep = "[...]\n"; } } if ($err ne '') { ERROR("SWITCH_CASE_INDENT_LEVEL", "switch and case should be at the same indent\n$hereline$err"); } } # if/while/etc brace do not go on next line, unless defining a do while loop, # or if that brace on the next line is for something else if ($line =~ /(.*)\b((?:if|while|for|switch)\s*\(|do\b|else\b)/ && $line !~ /^.\s*\#/) { my $pre_ctx = "$1$2"; my ($level, @ctx) = ctx_statement_level($linenr, $realcnt, 0); if ($line =~ /^\+\t{6,}/) { WARN("DEEP_INDENTATION", "Too many leading tabs - consider code refactoring\n" . $herecurr); } my $ctx_cnt = $realcnt - $#ctx - 1; my $ctx = join("\n", @ctx); my $ctx_ln = $linenr; my $ctx_skip = $realcnt; while ($ctx_skip > $ctx_cnt || ($ctx_skip == $ctx_cnt && defined $lines[$ctx_ln - 1] && $lines[$ctx_ln - 1] =~ /^-/)) { ##print "SKIP<$ctx_skip> CNT<$ctx_cnt>\n"; $ctx_skip-- if (!defined $lines[$ctx_ln - 1] || $lines[$ctx_ln - 1] !~ /^-/); $ctx_ln++; } #print "realcnt<$realcnt> ctx_cnt<$ctx_cnt>\n"; #print "pre<$pre_ctx>\nline<$line>\nctx<$ctx>\nnext<$lines[$ctx_ln - 1]>\n"; if ($ctx !~ /{\s*/ && defined($lines[$ctx_ln -1]) && $lines[$ctx_ln - 1] =~ /^\+\s*{/) { ERROR("OPEN_BRACE", "that open brace { should be on the previous line\n" . "$here\n$ctx\n$rawlines[$ctx_ln - 1]\n"); } if ($level == 0 && $pre_ctx !~ /}\s*while\s*\($/ && $ctx =~ /\)\s*\;\s*$/ && defined $lines[$ctx_ln - 1]) { my ($nlength, $nindent) = line_stats($lines[$ctx_ln - 1]); if ($nindent > $indent) { WARN("TRAILING_SEMICOLON", "trailing semicolon indicates no statements, indent implies otherwise\n" . "$here\n$ctx\n$rawlines[$ctx_ln - 1]\n"); } } } # Check relative indent for conditionals and blocks. 
if ($line =~ /\b(?:(?:if|while|for)\s*\(|do\b)/ && $line !~ /^.\s*#/ && $line !~ /\}\s*while\s*/) { ($stat, $cond, $line_nr_next, $remain_next, $off_next) = ctx_statement_block($linenr, $realcnt, 0) if (!defined $stat); my ($s, $c) = ($stat, $cond); substr($s, 0, length($c), ''); # Make sure we remove the line prefixes as we have # none on the first line, and are going to readd them # where necessary. $s =~ s/\n./\n/gs; # Find out how long the conditional actually is. my @newlines = ($c =~ /\n/gs); my $cond_lines = 1 + $#newlines; # We want to check the first line inside the block # starting at the end of the conditional, so remove: # 1) any blank line termination # 2) any opening brace { on end of the line # 3) any do (...) { my $continuation = 0; my $check = 0; $s =~ s/^.*\bdo\b//; $s =~ s/^\s*{//; if ($s =~ s/^\s*\\//) { $continuation = 1; } if ($s =~ s/^\s*?\n//) { $check = 1; $cond_lines++; } # Also ignore a loop construct at the end of a # preprocessor statement. if (($prevline =~ /^.\s*#\s*define\s/ || $prevline =~ /\\\s*$/) && $continuation == 0) { $check = 0; } my $cond_ptr = -1; $continuation = 0; while ($cond_ptr != $cond_lines) { $cond_ptr = $cond_lines; # If we see an #else/#elif then the code # is not linear. if ($s =~ /^\s*\#\s*(?:else|elif)/) { $check = 0; } # Ignore: # 1) blank lines, they should be at 0, # 2) preprocessor lines, and # 3) labels. if ($continuation || $s =~ /^\s*?\n/ || $s =~ /^\s*#\s*?/ || $s =~ /^\s*$Ident\s*:/) { $continuation = ($s =~ /^.*?\\\n/) ? 1 : 0; if ($s =~ s/^.*?\n//) { $cond_lines++; } } } my (undef, $sindent) = line_stats("+" . $s); my $stat_real = raw_line($linenr, $cond_lines); # Check if either of these lines are modified, else # this is not this patch's fault. 
if (!defined($stat_real) || $stat !~ /^\+/ && $stat_real !~ /^\+/) { $check = 0; } if (defined($stat_real) && $cond_lines > 1) { $stat_real = "[...]\n$stat_real"; } #print "line<$line> prevline<$prevline> indent<$indent> sindent<$sindent> check<$check> continuation<$continuation> s<$s> cond_lines<$cond_lines> stat_real<$stat_real> stat<$stat>\n"; if ($check && (($sindent % 8) != 0 || ($sindent <= $indent && $s ne ''))) { WARN("SUSPECT_CODE_INDENT", "suspect code indent for conditional statements ($indent, $sindent)\n" . $herecurr . "$stat_real\n"); } } # Track the 'values' across context and added lines. my $opline = $line; $opline =~ s/^./ /; my ($curr_values, $curr_vars) = annotate_values($opline . "\n", $prev_values); $curr_values = $prev_values . $curr_values; if ($dbg_values) { my $outline = $opline; $outline =~ s/\t/ /g; print "$linenr > .$outline\n"; print "$linenr > $curr_values\n"; print "$linenr > $curr_vars\n"; } $prev_values = substr($curr_values, -1); #ignore lines not being added if ($line=~/^[^\+]/) {next;} # TEST: allow direct testing of the type matcher. if ($dbg_type) { if ($line =~ /^.\s*$Declare\s*$/) { ERROR("TEST_TYPE", "TEST: is type\n" . $herecurr); } elsif ($dbg_type > 1 && $line =~ /^.+($Declare)/) { ERROR("TEST_NOT_TYPE", "TEST: is not type ($1 is)\n". $herecurr); } next; } # TEST: allow direct testing of the attribute matcher. if ($dbg_attr) { if ($line =~ /^.\s*$Modifier\s*$/) { ERROR("TEST_ATTR", "TEST: is attr\n" . $herecurr); } elsif ($dbg_attr > 1 && $line =~ /^.+($Modifier)/) { ERROR("TEST_NOT_ATTR", "TEST: is not attr ($1 is)\n". $herecurr); } next; } # check for initialisation to aggregates open brace on the next line if ($line =~ /^.\s*{/ && $prevline =~ /(?:^|[^=])=\s*$/) { ERROR("OPEN_BRACE", "that open brace { should be on the previous line\n" . $hereprev); } # # Checks which are anchored on the added line. 
# # check for malformed paths in #include statements (uses RAW line) if ($rawline =~ m{^.\s*\#\s*include\s+[<"](.*)[">]}) { my $path = $1; if ($path =~ m{//}) { ERROR("MALFORMED_INCLUDE", "malformed #include filename\n" . $herecurr); } } # no C99 // comments if ($line =~ m{//}) { ERROR("C99_COMMENTS", "do not use C99 // comments\n" . $herecurr); } # Remove C99 comments. $line =~ s@//.*@@; $opline =~ s@//.*@@; # check for global initialisers. if ($line =~ /^.$Type\s*$Ident\s*(?:\s+$Modifier)*\s*=\s*(0|NULL|false)\s*;/) { ERROR("GLOBAL_INITIALISERS", "do not initialise globals to 0 or NULL\n" . $herecurr); } # check for static initialisers. if ($line =~ /\bstatic\s.*=\s*(0|NULL|false)\s*;/) { ERROR("INITIALISED_STATIC", "do not initialise statics to 0 or NULL\n" . $herecurr); } # check for static const char * arrays. if ($line =~ /\bstatic\s+const\s+char\s*\*\s*(\w+)\s*\[\s*\]\s*=\s*/) { WARN("STATIC_CONST_CHAR_ARRAY", "static const char * array should probably be static const char * const\n" . $herecurr); } # check for static char foo[] = "bar" declarations. if ($line =~ /\bstatic\s+char\s+(\w+)\s*\[\s*\]\s*=\s*"/) { WARN("STATIC_CONST_CHAR_ARRAY", "static char array declaration should probably be static const char\n" . $herecurr); } # check for declarations of struct pci_device_id if ($line =~ /\bstruct\s+pci_device_id\s+\w+\s*\[\s*\]\s*\=\s*\{/) { WARN("DEFINE_PCI_DEVICE_TABLE", "Use DEFINE_PCI_DEVICE_TABLE for struct pci_device_id\n" . $herecurr); } # check for new typedefs, only function parameters and sparse annotations # make sense. if ($line =~ /\btypedef\s/ && $line !~ /\btypedef\s+$Type\s*\(\s*\*?$Ident\s*\)\s*\(/ && $line !~ /\btypedef\s+$Type\s+$Ident\s*\(/ && $line !~ /\b$typeTypedefs\b/ && $line !~ /\b__bitwise(?:__|)\b/) { WARN("NEW_TYPEDEFS", "do not add new typedefs\n" . 
$herecurr); } # * goes on variable not on type # (char*[ const]) while ($line =~ m{(\($NonptrType(\s*(?:$Modifier\b\s*|\*\s*)+)\))}g) { #print "AA<$1>\n"; my ($from, $to) = ($2, $2); # Should start with a space. $to =~ s/^(\S)/ $1/; # Should not end with a space. $to =~ s/\s+$//; # '*'s should not have spaces between. while ($to =~ s/\*\s+\*/\*\*/) { } #print "from<$from> to<$to>\n"; if ($from ne $to) { ERROR("POINTER_LOCATION", "\"(foo$from)\" should be \"(foo$to)\"\n" . $herecurr); } } while ($line =~ m{(\b$NonptrType(\s*(?:$Modifier\b\s*|\*\s*)+)($Ident))}g) { #print "BB<$1>\n"; my ($from, $to, $ident) = ($2, $2, $3); # Should start with a space. $to =~ s/^(\S)/ $1/; # Should not end with a space. $to =~ s/\s+$//; # '*'s should not have spaces between. while ($to =~ s/\*\s+\*/\*\*/) { } # Modifiers should have spaces. $to =~ s/(\b$Modifier$)/$1 /; #print "from<$from> to<$to> ident<$ident>\n"; if ($from ne $to && $ident !~ /^$Modifier$/) { ERROR("POINTER_LOCATION", "\"foo${from}bar\" should be \"foo${to}bar\"\n" . $herecurr); } } # function brace can't be on same line, except for #defines of do while, # or if closed on same line if (($line=~/$Type\s*$Ident\(.*\).*\s{/) and !($line=~/\#\s*define.*do\s{/) and !($line=~/}/)) { ERROR("OPEN_BRACE", "open brace '{' following function declarations go on the next line\n" . $herecurr); } # open braces for enum, union and struct go on the same line. if ($line =~ /^.\s*{/ && $prevline =~ /^.\s*(?:typedef\s+)?(enum|union|struct)(?:\s+$Ident)?\s*$/) { ERROR("OPEN_BRACE", "open brace '{' following $1 go on the same line\n" . $hereprev); } # missing space after union, struct or enum definition if ($line =~ /^.\s*(?:typedef\s+)?(enum|union|struct)(?:\s+$Ident)?(?:\s+$Ident)?[=\{]/) { WARN("SPACING", "missing space after $1 definition\n" . $herecurr); } # check for spacing round square brackets; allowed: # 1. with a type on the left -- int [] a; # 2. at the beginning of a line for slice initialisers -- [0...10] = 5, # 3. 
inside a curly brace -- = { [0...10] = 5 } while ($line =~ /(.*?\s)\[/g) { my ($where, $prefix) = ($-[1], $1); if ($prefix !~ /$Type\s+$/ && ($where != 0 || $prefix !~ /^.\s+$/) && $prefix !~ /{\s+$/) { ERROR("BRACKET_SPACE", "space prohibited before open square bracket '['\n" . $herecurr); } } # check for spaces between functions and their parentheses. while ($line =~ /($Ident)\s+\(/g) { my $name = $1; my $ctx_before = substr($line, 0, $-[1]); my $ctx = "$ctx_before$name"; # Ignore those directives where spaces _are_ permitted. if ($name =~ /^(?: if|for|while|switch|return|case| volatile|__volatile__| __attribute__|format|__extension__| asm|__asm__)$/x) { # cpp #define statements have non-optional spaces, ie # if there is a space between the name and the open # parenthesis it is simply not a parameter group. } elsif ($ctx_before =~ /^.\s*\#\s*define\s*$/) { # cpp #elif statement condition may start with a ( } elsif ($ctx =~ /^.\s*\#\s*elif\s*$/) { # If this whole things ends with a type its most # likely a typedef for a function. } elsif ($ctx =~ /$Type$/) { } else { WARN("SPACING", "space prohibited between function name and open parenthesis '('\n" . $herecurr); } } # Check operator spacing. if (!($line=~/\#\s*include/)) { my $ops = qr{ <<=|>>=|<=|>=|==|!=| \+=|-=|\*=|\/=|%=|\^=|\|=|&=| =>|->|<<|>>|<|>|=|!|~| &&|\|\||,|\^|\+\+|--|&|\||\+|-|\*|\/|%| \?|: }x; my @elements = split(/($ops|;)/, $opline); my $off = 0; my $blank = copy_spacing($opline); for (my $n = 0; $n < $#elements; $n += 2) { $off += length($elements[$n]); # Pick up the preceding and succeeding characters. 
my $ca = substr($opline, 0, $off); my $cc = ''; if (length($opline) >= ($off + length($elements[$n + 1]))) { $cc = substr($opline, $off + length($elements[$n + 1])); } my $cb = "$ca$;$cc"; my $a = ''; $a = 'V' if ($elements[$n] ne ''); $a = 'W' if ($elements[$n] =~ /\s$/); $a = 'C' if ($elements[$n] =~ /$;$/); $a = 'B' if ($elements[$n] =~ /(\[|\()$/); $a = 'O' if ($elements[$n] eq ''); $a = 'E' if ($ca =~ /^\s*$/); my $op = $elements[$n + 1]; my $c = ''; if (defined $elements[$n + 2]) { $c = 'V' if ($elements[$n + 2] ne ''); $c = 'W' if ($elements[$n + 2] =~ /^\s/); $c = 'C' if ($elements[$n + 2] =~ /^$;/); $c = 'B' if ($elements[$n + 2] =~ /^(\)|\]|;)/); $c = 'O' if ($elements[$n + 2] eq ''); $c = 'E' if ($elements[$n + 2] =~ /^\s*\\$/); } else { $c = 'E'; } my $ctx = "${a}x${c}"; my $at = "(ctx:$ctx)"; my $ptr = substr($blank, 0, $off) . "^"; my $hereptr = "$hereline$ptr\n"; # Pull out the value of this operator. my $op_type = substr($curr_values, $off + 1, 1); # Get the full operator variant. my $opv = $op . substr($curr_vars, $off, 1); # Ignore operators passed as parameters. if ($op_type ne 'V' && $ca =~ /\s$/ && $cc =~ /^\s*,/) { # # Ignore comments # } elsif ($op =~ /^$;+$/) { # ; should have either the end of line or a space or \ after it } elsif ($op eq ';') { if ($ctx !~ /.x[WEBC]/ && $cc !~ /^\\/ && $cc !~ /^;/) { ERROR("SPACING", "space required after that '$op' $at\n" . $hereptr); } # // is a comment } elsif ($op eq '//') { # No spaces for: # -> # : when part of a bitfield } elsif ($op eq '->' || $opv eq ':B') { if ($ctx =~ /Wx.|.xW/) { ERROR("SPACING", "spaces prohibited around that '$op' $at\n" . $hereptr); } # , must have a space on the right. } elsif ($op eq ',') { if ($ctx !~ /.x[WEC]/ && $cc !~ /^}/) { ERROR("SPACING", "space required after that '$op' $at\n" . $hereptr); } # '*' as part of a type definition -- reported already. } elsif ($opv eq '*_') { #warn "'*' is part of type\n"; # unary operators should have a space before and # none after. 
May be left adjacent to another # unary operator, or a cast } elsif ($op eq '!' || $op eq '~' || $opv eq '*U' || $opv eq '-U' || $opv eq '&U' || $opv eq '&&U') { if ($ctx !~ /[WEBC]x./ && $ca !~ /(?:\)|!|~|\*|-|\&|\||\+\+|\-\-|\{)$/) { ERROR("SPACING", "space required before that '$op' $at\n" . $hereptr); } if ($op eq '*' && $cc =~/\s*$Modifier\b/) { # A unary '*' may be const } elsif ($ctx =~ /.xW/) { ERROR("SPACING", "space prohibited after that '$op' $at\n" . $hereptr); } # unary ++ and unary -- are allowed no space on one side. } elsif ($op eq '++' or $op eq '--') { if ($ctx !~ /[WEOBC]x[^W]/ && $ctx !~ /[^W]x[WOBEC]/) { ERROR("SPACING", "space required one side of that '$op' $at\n" . $hereptr); } if ($ctx =~ /Wx[BE]/ || ($ctx =~ /Wx./ && $cc =~ /^;/)) { ERROR("SPACING", "space prohibited before that '$op' $at\n" . $hereptr); } if ($ctx =~ /ExW/) { ERROR("SPACING", "space prohibited after that '$op' $at\n" . $hereptr); } # << and >> may either have or not have spaces both sides } elsif ($op eq '<<' or $op eq '>>' or $op eq '&' or $op eq '^' or $op eq '|' or $op eq '+' or $op eq '-' or $op eq '*' or $op eq '/' or $op eq '%') { if ($ctx =~ /Wx[^WCE]|[^WCE]xW/) { ERROR("SPACING", "need consistent spacing around '$op' $at\n" . $hereptr); } # A colon needs no spaces before when it is # terminating a case value or a label. } elsif ($opv eq ':C' || $opv eq ':L') { if ($ctx =~ /Wx./) { ERROR("SPACING", "space prohibited before that '$op' $at\n" . $hereptr); } # All the others need spaces both sides. } elsif ($ctx !~ /[EWC]x[CWE]/) { my $ok = 0; # Ignore email addresses if (($op eq '<' && $cc =~ /^\S+\@\S+>/) || ($op eq '>' && $ca =~ /<\S+\@\S+$/)) { $ok = 1; } # Ignore ?: if (($opv eq ':O' && $ca =~ /\?$/) || ($op eq '?' && $cc =~ /^:/)) { $ok = 1; } if ($ok == 0) { ERROR("SPACING", "spaces required around that '$op' $at\n" . 
$hereptr); } } $off += length($elements[$n + 1]); } } # check for multiple assignments if ($line =~ /^.\s*$Lval\s*=\s*$Lval\s*=(?!=)/) { CHK("MULTIPLE_ASSIGNMENTS", "multiple assignments should be avoided\n" . $herecurr); } #need space before brace following if, while, etc if (($line =~ /\(.*\){/ && $line !~ /\($Type\){/) || $line =~ /do{/) { ERROR("SPACING", "space required before the open brace '{'\n" . $herecurr); } # closing brace should have a space following it when it has anything # on the line if ($line =~ /}(?!(?:,|;|\)))\S/) { ERROR("SPACING", "space required after that close brace '}'\n" . $herecurr); } # check spacing on square brackets if ($line =~ /\[\s/ && $line !~ /\[\s*$/) { ERROR("SPACING", "space prohibited after that open square bracket '['\n" . $herecurr); } if ($line =~ /\s\]/) { ERROR("SPACING", "space prohibited before that close square bracket ']'\n" . $herecurr); } # check spacing on parentheses if ($line =~ /\(\s/ && $line !~ /\(\s*(?:\\)?$/ && $line !~ /for\s*\(\s+;/) { ERROR("SPACING", "space prohibited after that open parenthesis '('\n" . $herecurr); } if ($line =~ /(\s+)\)/ && $line !~ /^.\s*\)/ && $line !~ /for\s*\(.*;\s+\)/ && $line !~ /:\s+\)/) { ERROR("SPACING", "space prohibited before that close parenthesis ')'\n" . $herecurr); } #goto labels aren't indented, allow a single space however if ($line=~/^.\s+[A-Za-z\d_]+:(?![0-9]+)/ and !($line=~/^. [A-Za-z\d_]+:/) and !($line=~/^.\s+default:/)) { WARN("INDENTED_LABEL", "labels should not be indented\n" . $herecurr); } # Return is not a function. 
if (defined($stat) && $stat =~ /^.\s*return(\s*)(\(.*);/s) { my $spacing = $1; my $value = $2; # Flatten any parentheses $value =~ s/\(/ \(/g; $value =~ s/\)/\) /g; while ($value =~ s/\[[^\[\]]*\]/1/ || $value !~ /(?:$Ident|-?$Constant)\s* $Compare\s* (?:$Ident|-?$Constant)/x && $value =~ s/\([^\(\)]*\)/1/) { } #print "value<$value>\n"; if ($value =~ /^\s*(?:$Ident|-?$Constant)\s*$/) { ERROR("RETURN_PARENTHESES", "return is not a function, parentheses are not required\n" . $herecurr); } elsif ($spacing !~ /\s+/) { ERROR("SPACING", "space required before the open parenthesis '('\n" . $herecurr); } } # Return of what appears to be an errno should normally be -'ve if ($line =~ /^.\s*return\s*(E[A-Z]*)\s*;/) { my $name = $1; if ($name ne 'EOF' && $name ne 'ERROR') { WARN("USE_NEGATIVE_ERRNO", "return of an errno should typically be -ve (return -$1)\n" . $herecurr); } } # Need a space before open parenthesis after if, while etc if ($line=~/\b(if|while|for|switch)\(/) { ERROR("SPACING", "space required before the open parenthesis '('\n" . $herecurr); } # Check for illegal assignment in if conditional -- and check for trailing # statements after the conditional. if ($line =~ /do\s*(?!{)/) { ($stat, $cond, $line_nr_next, $remain_next, $off_next) = ctx_statement_block($linenr, $realcnt, 0) if (!defined $stat); my ($stat_next) = ctx_statement_block($line_nr_next, $remain_next, $off_next); $stat_next =~ s/\n./\n /g; ##print "stat<$stat> stat_next<$stat_next>\n"; if ($stat_next =~ /^\s*while\b/) { # If the statement carries leading newlines, # then count those as offsets. 
my ($whitespace) = ($stat_next =~ /^((?:\s*\n[+-])*\s*)/s); my $offset = statement_rawlines($whitespace) - 1; $suppress_whiletrailers{$line_nr_next + $offset} = 1; } } if (!defined $suppress_whiletrailers{$linenr} && $line =~ /\b(?:if|while|for)\s*\(/ && $line !~ /^.\s*#/) { my ($s, $c) = ($stat, $cond); if ($c =~ /\bif\s*\(.*[^<>!=]=[^=].*/s) { ERROR("ASSIGN_IN_IF", "do not use assignment in if condition\n" . $herecurr); } # Find out what is on the end of the line after the # conditional. substr($s, 0, length($c), ''); $s =~ s/\n.*//g; $s =~ s/$;//g; # Remove any comments if (length($c) && $s !~ /^\s*{?\s*\\*\s*$/ && $c !~ /}\s*while\s*/) { # Find out how long the conditional actually is. my @newlines = ($c =~ /\n/gs); my $cond_lines = 1 + $#newlines; my $stat_real = ''; $stat_real = raw_line($linenr, $cond_lines) . "\n" if ($cond_lines); if (defined($stat_real) && $cond_lines > 1) { $stat_real = "[...]\n$stat_real"; } ERROR("TRAILING_STATEMENTS", "trailing statements should be on next line\n" . $herecurr . $stat_real); } } # Check for bitwise tests written as boolean if ($line =~ / (?: (?:\[|\(|\&\&|\|\|) \s*0[xX][0-9]+\s* (?:\&\&|\|\|) | (?:\&\&|\|\|) \s*0[xX][0-9]+\s* (?:\&\&|\|\||\)|\]) )/x) { WARN("HEXADECIMAL_BOOLEAN_TEST", "boolean test with hexadecimal, perhaps just 1 \& or \|?\n" . $herecurr); } # if and else should not have general statements after it if ($line =~ /^.\s*(?:}\s*)?else\b(.*)/) { my $s = $1; $s =~ s/$;//g; # Remove any comments if ($s !~ /^\s*(?:\sif|(?:{|)\s*\\?\s*$)/) { ERROR("TRAILING_STATEMENTS", "trailing statements should be on next line\n" . $herecurr); } } # if should not continue a brace if ($line =~ /}\s*if\b/) { ERROR("TRAILING_STATEMENTS", "trailing statements should be on next line\n" . 
$herecurr); } # case and default should not have general statements after them if ($line =~ /^.\s*(?:case\s*.*|default\s*):/g && $line !~ /\G(?: (?:\s*$;*)(?:\s*{)?(?:\s*$;*)(?:\s*\\)?\s*$| \s*return\s+ )/xg) { ERROR("TRAILING_STATEMENTS", "trailing statements should be on next line\n" . $herecurr); } # Check for }else {, these must be at the same # indent level to be relevant to each other. if ($prevline=~/}\s*$/ and $line=~/^.\s*else\s*/ and $previndent == $indent) { ERROR("ELSE_AFTER_BRACE", "else should follow close brace '}'\n" . $hereprev); } if ($prevline=~/}\s*$/ and $line=~/^.\s*while\s*/ and $previndent == $indent) { my ($s, $c) = ctx_statement_block($linenr, $realcnt, 0); # Find out what is on the end of the line after the # conditional. substr($s, 0, length($c), ''); $s =~ s/\n.*//g; if ($s =~ /^\s*;/) { ERROR("WHILE_AFTER_BRACE", "while should follow close brace '}'\n" . $hereprev); } } #no spaces allowed after \ in define if ($line=~/\#\s*define.*\\\s$/) { WARN("WHITESPACE_AFTER_LINE_CONTINUATION", "Whitepspace after \\ makes next lines useless\n" . $herecurr); } # check for redundant bracing round if etc if ($line =~ /(^.*)\bif\b/ && $1 !~ /else\s*$/) { my ($level, $endln, @chunks) = ctx_statement_full($linenr, $realcnt, 1); #print "chunks<$#chunks> linenr<$linenr> endln<$endln> level<$level>\n"; #print "APW: <<$chunks[1][0]>><<$chunks[1][1]>>\n"; if ($#chunks > 0 && $level == 0) { my $allowed = 0; my $seen = 0; my $herectx = $here . "\n"; my $ln = $linenr - 1; for my $chunk (@chunks) { my ($cond, $block) = @{$chunk}; # If the condition carries leading newlines, then count those as offsets. my ($whitespace) = ($cond =~ /^((?:\s*\n[+-])*\s*)/s); my $offset = statement_rawlines($whitespace) - 1; #print "COND<$cond> whitespace<$whitespace> offset<$offset>\n"; # We have looked at and allowed this specific line. 
$suppress_ifbraces{$ln + $offset} = 1; $herectx .= "$rawlines[$ln + $offset]\n[...]\n"; $ln += statement_rawlines($block) - 1; substr($block, 0, length($cond), ''); $seen++ if ($block =~ /^\s*{/); #print "cond<$cond> block<$block> allowed<$allowed>\n"; if (statement_lines($cond) > 1) { #print "APW: ALLOWED: cond<$cond>\n"; $allowed = 1; } if ($block =~/\b(?:if|for|while)\b/) { #print "APW: ALLOWED: block<$block>\n"; $allowed = 1; } if (statement_block_size($block) > 1) { #print "APW: ALLOWED: lines block<$block>\n"; $allowed = 1; } } if ($seen && !$allowed) { WARN("BRACES", "braces {} are not necessary for any arm of this statement\n" . $herectx); } } } if (!defined $suppress_ifbraces{$linenr - 1} && $line =~ /\b(if|while|for|else)\b/) { my $allowed = 0; # Check the pre-context. if (substr($line, 0, $-[0]) =~ /(\}\s*)$/) { #print "APW: ALLOWED: pre<$1>\n"; $allowed = 1; } my ($level, $endln, @chunks) = ctx_statement_full($linenr, $realcnt, $-[0]); # Check the condition. my ($cond, $block) = @{$chunks[0]}; #print "CHECKING<$linenr> cond<$cond> block<$block>\n"; if (defined $cond) { substr($block, 0, length($cond), ''); } if (statement_lines($cond) > 1) { #print "APW: ALLOWED: cond<$cond>\n"; $allowed = 1; } if ($block =~/\b(?:if|for|while)\b/) { #print "APW: ALLOWED: block<$block>\n"; $allowed = 1; } if (statement_block_size($block) > 1) { #print "APW: ALLOWED: lines block<$block>\n"; $allowed = 1; } # Check the post-context. if (defined $chunks[1]) { my ($cond, $block) = @{$chunks[1]}; if (defined $cond) { substr($block, 0, length($cond), ''); } if ($block =~ /^\s*\{/) { #print "APW: ALLOWED: chunk-1 block<$block>\n"; $allowed = 1; } } if ($level == 0 && $block =~ /^\s*\{/ && !$allowed) { my $herectx = $here . "\n"; my $cnt = statement_rawlines($block); for (my $n = 0; $n < $cnt; $n++) { $herectx .= raw_line($linenr, $n) . "\n"; } WARN("BRACES", "braces {} are not necessary for single statement blocks\n" . 
$herectx); } } # no volatiles please my $asm_volatile = qr{\b(__asm__|asm)\s+(__volatile__|volatile)\b}; if ($line =~ /\bvolatile\b/ && $line !~ /$asm_volatile/) { WARN("VOLATILE", "Use of volatile is usually wrong: see Documentation/volatile-considered-harmful.txt\n" . $herecurr); } # warn about #if 0 if ($line =~ /^.\s*\#\s*if\s+0\b/) { CHK("REDUNDANT_CODE", "if this code is redundant consider removing it\n" . $herecurr); } # warn about spacing in #ifdefs if ($line =~ /^.\s*\#\s*(ifdef|ifndef|elif)\s\s+/) { ERROR("SPACING", "exactly one space required after that #$1\n" . $herecurr); } # Check that the storage class is at the beginning of a declaration if ($line =~ /\b$Storage\b/ && $line !~ /^.\s*$Storage\b/) { WARN("STORAGE_CLASS", "storage class should be at the beginning of the declaration\n" . $herecurr) } # check the location of the inline attribute, that it is between # storage class and type. if ($line =~ /\b$Type\s+$Inline\b/ || $line =~ /\b$Inline\s+$Storage\b/) { ERROR("INLINE_LOCATION", "inline keyword should sit between storage class and type\n" . $herecurr); } # Check for __inline__ and __inline, prefer inline if ($line =~ /\b(__inline__|__inline)\b/) { WARN("INLINE", "plain inline is preferred over $1\n" . $herecurr); } # Check for __attribute__ packed, prefer __packed if ($line =~ /\b__attribute__\s*\(\s*\(.*\bpacked\b/) { WARN("PREFER_PACKED", "__packed is preferred over __attribute__((packed))\n" . $herecurr); } # Check for __attribute__ aligned, prefer __aligned if ($line =~ /\b__attribute__\s*\(\s*\(.*aligned/) { WARN("PREFER_ALIGNED", "__aligned(size) is preferred over __attribute__((aligned(size)))\n" . $herecurr); } # Check for __attribute__ format(printf, prefer __printf if ($line =~ /\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf/) { WARN("PREFER_PRINTF", "__printf(string-index, first-to-check) is preferred over __attribute__((format(printf, string-index, first-to-check)))\n" . 
$herecurr); } # check for sizeof(&) if ($line =~ /\bsizeof\s*\(\s*\&/) { WARN("SIZEOF_ADDRESS", "sizeof(& should be avoided\n" . $herecurr); } # check for line continuations in quoted strings with odd counts of " if ($rawline =~ /\\$/ && $rawline =~ tr/"/"/ % 2) { WARN("LINE_CONTINUATIONS", "Avoid line continuations in quoted strings\n" . $herecurr); } # Check for misused memsets if (defined $stat && $stat =~ /^\+(?:.*?)\bmemset\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*$FuncArg\s*\)/s) { my $ms_addr = $2; my $ms_val = $8; my $ms_size = $14; if ($ms_size =~ /^(0x|)0$/i) { ERROR("MEMSET", "memset to 0's uses 0 as the 2nd argument, not the 3rd\n" . "$here\n$stat\n"); } elsif ($ms_size =~ /^(0x|)1$/i) { WARN("MEMSET", "single byte memset is suspicious. Swapped 2nd/3rd argument?\n" . "$here\n$stat\n"); } } # typecasts on min/max could be min_t/max_t if (defined $stat && $stat =~ /^\+(?:.*?)\b(min|max)\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\)/) { if (defined $2 || defined $8) { my $call = $1; my $cast1 = deparenthesize($2); my $arg1 = $3; my $cast2 = deparenthesize($8); my $arg2 = $9; my $cast; if ($cast1 ne "" && $cast2 ne "") { $cast = "$cast1 or $cast2"; } elsif ($cast1 ne "") { $cast = $cast1; } else { $cast = $cast2; } WARN("MINMAX", "$call() should probably be ${call}_t($cast, $arg1, $arg2)\n" . "$here\n$stat\n"); } } # check for new externs in .c files. if ($realfile =~ /\.c$/ && defined $stat && $stat =~ /^.\s*(?:extern\s+)?$Type\s+($Ident)(\s*)\(/s) { my $function_name = $1; my $paren_space = $2; my $s = $stat; if (defined $cond) { substr($s, 0, length($cond), ''); } if ($s =~ /^\s*;/ && $function_name ne 'uninitialized_var') { WARN("AVOID_EXTERNS", "externs should be avoided in .c files\n" . $herecurr); } if ($paren_space =~ /\n/) { WARN("FUNCTION_ARGUMENTS", "arguments for function declarations should follow identifier\n" . 
$herecurr); } } elsif ($realfile =~ /\.c$/ && defined $stat && $stat =~ /^.\s*extern\s+/) { WARN("AVOID_EXTERNS", "externs should be avoided in .c files\n" . $herecurr); } # check for multiple semicolons if ($line =~ /;\s*;\s*$/) { WARN("ONE_SEMICOLON", "Statements terminations use 1 semicolon\n" . $herecurr); } # check for gcc specific __FUNCTION__ if ($line =~ /__FUNCTION__/) { WARN("USE_FUNC", "__func__ should be used instead of gcc specific __FUNCTION__\n" . $herecurr); } # check for %L{u,d,i} in strings my $string; while ($line =~ /(?:^|")([X\t]*)(?:"|$)/g) { $string = substr($rawline, $-[1], $+[1] - $-[1]); $string =~ s/%%/__/g; if ($string =~ /(?> 3) - ($pos >> 3); $lo .= "\t" x $ntab; $pos = $npos; $nsp = 0; } elsif ($c eq "\n" || $c eq "\r") { $lo .= " " x $nsp; $pos += $nsp; $nsp = 0; $lo .= $c; $pos = 0; } elsif ($c eq " ") { $nsp++; } else { $lo .= " " x $nsp; $pos += $nsp; $nsp = 0; $lo .= $c; $pos++; } } $lo .= " " x $nsp; return $lo; } # Compute the visual width of a string sub strwidth($) { no bytes; # Tab alignment depends on characters my($li) = @_; my($c, $i); my $pos = 0; my $mlen = 0; for ($i = 0; $i < length($li); $i++) { $c = substr($li,$i,1); if ($c eq "\t") { $pos = ($pos+8) & ~7; } elsif ($c eq "\n") { $mlen = $pos if ($pos > $mlen); $pos = 0; } else { $pos++; } } $mlen = $pos if ($pos > $mlen); return $mlen; } $name = basename($0); @files = (); while (defined($a = shift(@ARGV))) { if ($a =~ /^-/) { if ($a eq '-width' || $a eq '-w') { $max_width = shift(@ARGV)+0; } else { print STDERR "Usage: $name [-width #] files...\n"; exit 1; } } else { push(@files, $a); } } foreach $f ( @files ) { print STDERR "$name: $f\n"; if (! -f $f) { print STDERR "$f: not a file\n"; next; } if (!open(FILE, '+<', $f)) { print STDERR "$name: Cannot open file: $f: $!\n"; next; } binmode FILE; # First, verify that it is not a binary file; consider any file # with a zero byte to be a binary file. Is there any better, or # additional, heuristic that should be applied? 
$is_binary = 0; while (read(FILE, $data, 65536) > 0) { if ($data =~ /\0/) { $is_binary = 1; last; } } if ($is_binary) { print STDERR "$name: $f: binary file\n"; next; } seek(FILE, 0, 0); $in_bytes = 0; $out_bytes = 0; $lineno = 0; @lines = (); $in_hunk = 0; $err = 0; while ( defined($line = ) ) { $lineno++; $in_bytes += length($line); if (!$in_hunk) { if ($line =~ /^\@\@\s+\-([0-9]+),([0-9]+)\s+\+([0-9]+),([0-9]+)\s\@\@/) { $minus_lines = $2; $plus_lines = $4; if ($minus_lines || $plus_lines) { $in_hunk = 1; @hunk_lines = ($line); } } else { push(@lines, $line); $out_bytes += length($line); } } else { # We're in a hunk if ($line =~ /^\+/) { $plus_lines--; $text = substr($line, 1); $text =~ s/[ \t\r]*$//; # Remove trailing spaces $text = clean_space_tabs($text); $l_width = strwidth($text); if ($max_width && $l_width > $max_width) { print STDERR "$f:$lineno: adds line exceeds $max_width ", "characters ($l_width)\n"; } push(@hunk_lines, '+'.$text); } elsif ($line =~ /^\-/) { $minus_lines--; push(@hunk_lines, $line); } elsif ($line =~ /^ /) { $plus_lines--; $minus_lines--; push(@hunk_lines, $line); } else { print STDERR "$name: $f: malformed patch\n"; $err = 1; last; } if ($plus_lines < 0 || $minus_lines < 0) { print STDERR "$name: $f: malformed patch\n"; $err = 1; last; } elsif ($plus_lines == 0 && $minus_lines == 0) { # End of a hunk. Process this hunk. 
my $i; my $l; my @h = (); my $adj = 0; my $done = 0; for ($i = scalar(@hunk_lines)-1; $i > 0; $i--) { $l = $hunk_lines[$i]; if (!$done && $l eq "+\n") { $adj++; # Skip this line } elsif ($l =~ /^[ +]/) { $done = 1; unshift(@h, $l); } else { unshift(@h, $l); } } $l = $hunk_lines[0]; # Hunk header undef @hunk_lines; # Free memory if ($adj) { die unless ($l =~ /^\@\@\s+\-([0-9]+),([0-9]+)\s+\+([0-9]+),([0-9]+)\s\@\@(.*)$/); my $mstart = $1; my $mlin = $2; my $pstart = $3; my $plin = $4; my $tail = $5; # doesn't include the final newline $l = sprintf("@@ -%d,%d +%d,%d @@%s\n", $mstart, $mlin, $pstart, $plin-$adj, $tail); } unshift(@h, $l); # Transfer to the output array foreach $l (@h) { $out_bytes += length($l); push(@lines, $l); } $in_hunk = 0; } } } if ($in_hunk) { print STDERR "$name: $f: malformed patch\n"; $err = 1; } if (!$err) { if ($in_bytes != $out_bytes) { # Only write to the file if changed seek(FILE, 0, 0); print FILE @lines; if ( !defined($where = tell(FILE)) || !truncate(FILE, $where) ) { die "$name: Failed to truncate modified file: $f: $!\n"; } } } close(FILE); } sheepdog-0.7.5/script/gen_bash_completion.pl000077500000000000000000000073301223630776600212030ustar00rootroot00000000000000#!/usr/bin/perl # # Genrate bash_completion_dog # use strict; my ($program) = @ARGV; print "#!bash\n"; print "\n"; open IN, "$program -h |" or die "cannot find $program\n"; my @help = ; close IN; # Hash of sub command arrays. # E.g. $subcmds{'node'} = [kill, list, info, recovery, md] my %subcmds; # Hash of sub sub command arrays. # E.g. $subsubcmds{'trace graph'} = [cat, stat] my %subsubcmds; # Hash of option arrays. # E.g. 
$opts{'node list'} = [-a, --address, -p, --port, -r, --raw, -h, --help] my %opts; foreach (@help) { if (/^ (\S+) (\S+)/) { my ($cmd, $subcmd) = ($1, $2); $subcmds{$cmd} = [] if (!defined($subcmds{$cmd})); push @{$subcmds{$cmd}}, $subcmd; $opts{"$cmd $subcmd"} = []; $subsubcmds{"$cmd $subcmd"} = []; # run sub command to get more detailed usage open IN, "$program $cmd $subcmd -h |"; while () { if (/^ (-.), (--\S+)/) { # get options push @{$opts{"$cmd $subcmd"}}, $1; push @{$opts{"$cmd $subcmd"}}, $2; } elsif (/^ ([a-z]+)/) { # get available subcommands push @{$subsubcmds{"$cmd $subcmd"}}, $1; } } close IN; } } foreach my $cmd (keys %subcmds) { my @subcmds = @{$subcmds{$cmd}}; print command($cmd, @subcmds); foreach my $subcmd (@subcmds) { print subcommand($cmd, $subcmd); } } print <<__EOB__; _dog() { local opts cur cmd subcmd opts="@{[keys %subcmds]}" cur="\${COMP_WORDS[COMP_CWORD]}" if [ \$COMP_CWORD -gt 1 ]; then cmd=\${COMP_WORDS[1]} fi if [ \$COMP_CWORD -gt 2 ]; then subcmd=\${COMP_WORDS[2]} fi case "\${cmd}" in __EOB__ foreach my $cmd (keys %subcmds) { print <<__EOB__; $cmd) _dog_$cmd \${subcmd} ;; __EOB__ } print <<__EOB__; "") COMPREPLY=(\$( compgen -W "\${opts}" -- \${cur} )) ;; *) COMPREPLY=() ;; esac } complete -F _dog dog __EOB__ exit 0; # get a completion function for dog command (e.g. _dog_vdi()) sub command { my ($cmd, @subcmds) = @_; my $output; $output = <<__EOB__; _dog_${cmd}() { local opts opts="@subcmds" case "\$1" in __EOB__ foreach my $subcmd (@subcmds) { $output .= <<__EOB__; $subcmd) _dog_${cmd}_${subcmd} ;; __EOB__ } $output .= <<__EOB__; "") COMPREPLY=(\$( compgen \\ -W "\${opts}" \\ -- "\${COMP_WORDS[COMP_CWORD]}" )) ;; *) COMPREPLY=() ;; esac } __EOB__ $output =~ s/\t/ /g; $output =~ s/^ //gm; return $output; } # get a completion function for dog subcommands (e.g. 
_dog_vdi_create()) sub subcommand { my ($cmd, $subcmd) = @_; my $output; my @opts = @{$opts{"$cmd $subcmd"}}; my @subsubcmds = @{$subsubcmds{"$cmd $subcmd"}}; $output = <<__EOB__; _dog_${cmd}_${subcmd}() { local cur cur="\${COMP_WORDS[COMP_CWORD]}" case "\$cur" in -*) COMPREPLY=(\${COMPREPLY[@]} \\ \$( compgen \\ -W "@opts" \\ __EOB__ $output .= <<__EOB__; -- \${cur} )) ;; __EOB__ if ($cmd eq 'vdi' && $subcmd ne 'create') { $output .= <<__EOB__; *) local dog="\${COMP_WORDS[0]}" local vdilist="\$(\${dog} vdi list -r 2>/dev/null | awk '{print \$2}')" COMPREPLY=(\$( compgen -W "@subsubcmds \${vdilist}" -- \${cur} )) ;; __EOB__ } else { $output .= <<__EOB__; *) COMPREPLY=(\$( compgen -W "@subsubcmds" -- \${cur} )) ;; __EOB__ } $output .= <<__EOB__; esac } __EOB__ $output =~ s/\t/ /g; $output =~ s/^ //gm; return $output; } sheepdog-0.7.5/script/gen_man.pl000077500000000000000000000031371223630776600166110ustar00rootroot00000000000000#!/usr/bin/perl # # Genrate sheepdog manuals from help messages # use strict; my ($cwd) = ($0 =~ m%^(.+/)%); my $program = $ARGV[0]; ## generator functions sub sheep { my ($line) = @_; if ($line =~ /^ ([^,]+), (\S+)\s+(.+)/) { my ($opt, $longopt, $desc) = ($1, $2, $3); print escape(header("$opt, $longopt") . "\n"); print escape("$desc\n"); next if ($opt eq '-h'); # extract detailed help if available my $tmpfile = `mktemp`; chomp($tmpfile); my $help = `$program $tmpfile $opt 2> /dev/null`; unlink $tmpfile; $help =~ s/^\s+\$.+/\n$&\n/mg; print escape("\n$help"); } } sub dog { my ($line) = @_; if ($line =~ /^ (.+?) \s+(.+)/) { my ($cmd, $desc) = ($1, $2); my $help = join '', `$program $cmd -h`; $help =~ s/Usage: dog (.*)/header($1)/e; $help =~ s/^([A-Z][ a-zA-Z]*:)/\n$1/mg; print escape("$help\n"); print escape("Description:\n $desc\n"); } } sub sheepfs { my ($line) = @_; if ($line =~ /^ ([^,]+), (\S+)\s+(.+)/) { my ($opt, $longopt, $desc) = ($1, $2, $3); print escape(header("$opt, $longopt") . 
"\n"); print escape("$desc\n"); } } ## helper functions sub header { my ($str) = @_; return ".TP\n.BI \"$str\""; } sub escape { my ($str) = @_; $str =~ s/\t/ /g; $str =~ s/\\/\\\\\\/g; $str =~ s/"/\\"/g; $str =~ s/#/\\#/g; $str =~ s/\$/\\\$/g; $str =~ s/\n/\\n/g; return $str; } ## main routine open IN, "$program -h |" or die "cannot find $program\n"; my @help = ; close IN; foreach my $help (@help) { my ($func) = ($program =~ m#.*/(.+)#); chomp($help); eval "$func(\"$help\")"; } sheepdog-0.7.5/script/json_log_viewer.py000077500000000000000000000140171223630776600204140ustar00rootroot00000000000000#! /usr/bin/env python import sys, os, errno import json, curses import atexit begin_sec, begin_usec = -1, -1 class LogRecord(object): def __init__(self, json_line, proc): json_obj = json.loads(json_line) user_info = json_obj['user_info'] self.progname = user_info['program_name'] self.port = user_info['port'] body = json_obj['body'] self.timestamp = { 'sec': body['second'], 'usec': body['usecond']} self.worker_name = body['worker_name'] self.worker_idx = body['worker_idx'] self.func = body['func'] self.line = body['line'] self.msg = body['msg'] self.proc = proc self.color = None def is_sheep(self): return self.progname == 'sheep' def get_color(self): return self.proc.color def pop(self): ret = self.proc.__pop_next_record__() assert ret == self return ret def __lt__(self, other): if self.timestamp['sec'] < other.timestamp['sec']: return True elif other.timestamp['sec'] < self.timestamp['sec']: return False if self.timestamp['usec'] < other.timestamp['usec']: return True return False def format_line(self, max_x): sec = self.timestamp['sec'] usec = self.timestamp['usec'] udelta = usec - begin_usec if udelta < 0: udelta += 1000000 sec -= 1 t = '%d.%06d' % (sec - begin_sec, udelta) ret = '%s+%s: ' % (' ' * (10 - len(t[:10])), t[:10]) if self.progname == 'sheep': hdr = 'sheep %d,%s(%d) ' % \ (self.port, self.func, self.line) ret += hdr[:40] + ' ' * (40 - len(hdr[:40]) + 1) ret += 
self.msg return ret[:max_x - 1] return self.msg class Process(object): def __init__(self, log_file_path): self.log_file = open(log_file_path) self.next_record = None self.color = None def set_color(self, color): self.color = color def peek_next_record(self): if self.next_record == None: next_line = self.log_file.readline() if next_line == '': # end of the log return None self.next_record = LogRecord(next_line, self) return self.next_record # __pop_next_record__() must be called by LogRecord def __pop_next_record__(self): assert self.next_record != None ret = self.next_record self.next_record = None return ret dying_msg = '' w = None curses_colors = [ curses.COLOR_RED, curses.COLOR_GREEN, curses.COLOR_YELLOW, curses.COLOR_BLUE, curses.COLOR_MAGENTA, curses.COLOR_CYAN, ] nr_curses_colors = len(curses_colors) def init_curses(): global w w = curses.initscr() curses.nonl() curses.cbreak() curses.noecho() curses.start_color() for i in range(1, nr_curses_colors + 1): curses.init_pair(i, curses_colors[i - 1], curses.COLOR_BLACK) def assign_color(procs): sheeps = [] for proc in procs: if proc.peek_next_record().is_sheep(): sheeps.append(proc) nr_sheeps = len(sheeps) if nr_curses_colors < nr_sheeps: # we don't have enough colors to assign... 
return for i in range(0, nr_sheeps): sheeps[i].set_color(i + 1) current_y = 0 max_y, max_x = 0, 0 records = [] records_len = 0 def unify_records(procs): first_rec = procs[0].peek_next_record() for proc in procs[1:]: rec = proc.peek_next_record() if rec < first_rec: first_rec = rec records.append(first_rec.pop()) global begin_sec, begin_usec begin_sec = first_rec.timestamp['sec'] begin_usec = first_rec.timestamp['usec'] nr_procs = len(procs) is_empty = [False] * nr_procs nr_empteis = 0 while nr_empteis != nr_procs: next_rec = None for i in range(0, nr_procs): if is_empty[i]: continue proc = procs[i] rec = proc.peek_next_record() if rec == None: is_empty[i] = True nr_empteis += 1 continue if next_rec == None: next_rec = rec continue if rec < next_rec: next_rec = rec continue if next_rec == None: assert nr_empteis == nr_procs break records.append(next_rec.pop()) def update_terminal(): w.clear() for i in range(0, max_y): w.move(i, 0) if not current_y + i < records_len: break record = records[current_y + i] color = record.get_color() if color: w.attrset(curses.color_pair(color)) w.addstr(record.format_line(max_x)) if color: w.attroff(curses.color_pair(color)) w.refresh() if __name__ == '__main__': @atexit.register def exit_handler(): curses.endwin() if dying_msg != '': print dying_msg + '\n' init_curses() procs = map(lambda x: Process(x), sys.argv[1:]) assign_color(procs) unify_records(procs) records_len = len(records) tty_file = open('/dev/tty', 'rb') max_y, max_x = w.getmaxyx() update_terminal() running = True while running: try: key = tty_file.read(1) except IOError, (enr, msg): if enr == errno.EINTR: continue dying_msg = 'fatal error: %s' % \ (os.strerror(enr)) break if key == 'q': break elif key == 'j': if current_y + 1 < records_len: current_y += 1 elif key == 'k': if current_y: current_y -= 1 elif key == ' ': if current_y + max_y < records_len: current_y += max_y elif key == 'g': current_y = 0 elif key == 'G': current_y = records_len - max_y update_terminal() 
sheepdog-0.7.5/script/sheepdog.in000077500000000000000000000050751223630776600170010ustar00rootroot00000000000000#!/bin/bash # chkconfig: - 21 79 # description: Sheepdog # processname: sheep # ### BEGIN INIT INFO # Provides: sheepdog # Required-Start: $network # Should-Start: $syslog # Required-Stop: $network # Default-Start: # Default-Stop: # Short-Description: Starts and stops Sheepdog. # Description: Starts and stops Sheepdog. ### END INIT INFO desc="Sheepdog QEMU/KVM Block Storage" prog="sheep" # set secure PATH PATH="/sbin:/bin:/usr/sbin:/usr/bin:@SBINDIR@" SHEEPDOGD=@SBINDIR@/sheep success() { echo -ne "[ OK ]\r" } failure() { echo -ne "[FAILED]\r" } status() { pid=$(pidof $1 2>/dev/null) rtrn=$? if [ $rtrn -ne 0 ]; then echo "$1 is stopped" else echo "$1 (pid $pid) is running..." fi return $rtrn } # rpm based distros if [ -d @SYSCONFDIR@/sysconfig ]; then [ -f @INITDDIR@/functions ] && . @INITDDIR@/functions [ -f @SYSCONFDIR@/sysconfig/$prog ] && . @SYSCONFDIR@/sysconfig/$prog [ -z "$LOCK_FILE" ] && LOCK_FILE="@LOCALSTATEDIR@/lock/subsys/$prog" fi # deb based distros if [ -d @SYSCONFDIR@/default ]; then [ -f @SYSCONFDIR@/default/$prog ] && . @SYSCONFDIR@/default/$prog [ -z "$LOCK_FILE" ] && LOCK_FILE="@LOCALSTATEDIR@/lock/$prog" fi # The version of __pids_pidof in /etc/init.d/functions calls pidof with -x # This means it matches scripts, including this one. # Redefine it here so that status (from the same file) works. # Otherwise simultaneous calls to stop() will loop forever __pids_pidof() { pidof -c -o $$ -o $PPID -o %PPID "$1" || \ pidof -c -o $$ -o $PPID -o %PPID "${1##*/}" } start() { echo -n "Starting $desc ($prog): " # most recent distributions use tmpfs for @LOCALSTATEDIR@/run # to avoid to clean it up on every boot. 
# they also assume that init scripts will create # required subdirectories for proper operations mkdir -p @LOCALSTATEDIR@/run if status $prog > /dev/null 2>&1; then success else $prog -p 7000 @LOCALSTATEDIR@/lib/sheepdog > /dev/null 2>&1 # give it time to fail sleep 2 if status $prog > /dev/null 2>&1; then touch $LOCK_FILE success else failure rtrn=1 fi fi echo } stop() { ! status $prog > /dev/null 2>&1 && return echo -n "Stopping $desc ($prog): " killproc $prog RETVAL=$? rm -f $LOCK_FILE success echo } restart() { stop start } rtrn=0 case "$1" in start) start ;; restart|reload|force-reload) restart ;; condrestart|try-restart) if status $prog > /dev/null 2>&1; then restart fi ;; status) status $prog rtrn=$? ;; stop) stop ;; *) echo "usage: $0 {start|stop|restart|reload|force-reload|condrestart|try-restart|status}" rtrn=2 ;; esac exit $rtrn sheepdog-0.7.5/script/vditest000077500000000000000000000226031223630776600162540ustar00rootroot00000000000000#!/usr/bin/perl # # Copyright (C) 2010 MORITA Kazutaka # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License version # 2 as published by the Free Software Foundation. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . 
# use feature 'switch'; use strict; use Getopt::Std; use Time::HiRes qw(gettimeofday); use IPC::Open2; my $program = "vditest"; my ($vdiname, $vdisize); my $concurrency = 1; my $nr_outstanding_aio = 0; my ($lblk, $hblk) = (512, 1048576); my $cache = 'writethrough'; my $runtime = 10; my ($rrate, $wrate) = (100, 0); my $no_act = 0; my $offset = 0; my $seek_pattern = "linear"; my $seed = time(); my ($sblk, $eblk) = (0, 0); my $file = 0; my $flush_interval = 0; my $verbose = 0; my ($read_test, $write_test) = (0,0); my $hbeat = 0; my ($rd_bytes, $wr_bytes, $rd_ops, $wr_ops) = (0, 0, 0, 0); my ($total_rd_bytes, $total_wr_bytes, $total_rd_ops, $total_wr_ops) = (0, 0, 0, 0); $/ = 'qemu-io> '; parse(); print_options(); vdi_open($vdiname, $cache); vdi_main(); vdi_flush(); vdi_close(); sub get_aligned_blk { my ($l, $h) = @_; return $l + 512 * int(rand($h - $l + 512) / 512); } sub to_bytes { my ($size) = @_; given ($size) { when (/k(i?b)?$/i) { $size *= 1024; } when (/m(i?b)?$/i) { $size *= 1024 ** 2; } when (/g(i?b)?$/i) { $size *= 1024 ** 3; } } $_[0] = $size; } sub to_str { my ($size) = @_; my @units = ("", "K", "M", "G", "T", "P", "E", "Z", "Y"); while ($size >= 1024) { shift @units; $size /= 1024; } return sprintf "%.1f%s", $size, $units[0]; } sub print_options { my $opt = "options: "; $opt .= "-B $lblk:$hblk "; $opt .= "-c $cache "; $opt .= "-C $concurrency "; $opt .= "-D $rrate:$wrate "; $opt .= "-n " if $no_act; $opt .= "-o $offset\n"; $opt .= " "; $opt .= "-p $seek_pattern "; $opt .= "-s $seed "; $opt .= "-S $sblk:$eblk "; $opt .= "-T $runtime "; $opt .= "-f $flush_interval\n"; print $opt; } sub print_qemu { my ($cmd) = @_; print $cmd if $verbose; print QEMU $cmd if !$no_act; my $result = ; if ($verbose) { $result =~ s/qemu-io> //; print $result; } while ($result =~ /wrote|read/g) { $nr_outstanding_aio--; } } sub wait_aio_requests { my $old_sep = $/; $/ = "\n"; my $result = ; if ($verbose) { print $result; } while ($result =~ /wrote|read/g) { $nr_outstanding_aio--; } 
$/ = $old_sep; } sub vdi_open { my ($vdiname, $cache) = @_; my $cmd; return if $no_act; if ($file) { $cmd = "stdbuf -o0 qemu-io -t $cache $vdiname"; } else { $cmd = "stdbuf -o0 qemu-io -t $cache sheepdog:$vdiname"; } open2 *QEMU_OUT, *QEMU, $cmd or die "cannot run qemu-io" if !$no_act; ; } sub vdi_close { print_qemu("quit\n"); close QEMU if !$no_act; } sub vdi_read { my ($offset, $length) = @_; print_qemu("aio_read $offset $length\n"); $nr_outstanding_aio++; $rd_ops++; $rd_bytes += $length; $total_rd_ops++; $total_rd_bytes += $length; } sub vdi_write { my ($offset, $length) = @_; print_qemu("aio_write $offset $length\n"); $nr_outstanding_aio++; $wr_ops++; $wr_bytes += $length; $total_wr_ops++; $total_wr_bytes += $length; } sub vdi_flush { print_qemu("aio_flush\n"); } sub parse_opts { my %opts = (); getopts("?B:c:C:D:f:Fh:no:p:rs:S:T:vw", \%opts) or help(1); foreach my $key (keys %opts) { my $val = $opts{$key}; given ($key) { when ('?') { help(0); } when ('B') { ($lblk, $hblk) = ($val =~ /(\d+[kmg]?):?(\d*[kmg]?)/i); to_bytes($lblk); to_bytes($hblk); $hblk = $lblk if $hblk == 0; die "$lblk is not sector aligned" if $lblk % 512 != 0; die "$lblk is not valid" if $lblk == 0; die "$hblk is not sector aligned" if $hblk % 512 != 0; die "$hblk is too large" if $lblk > (64 * 1024 ** 2); die "transfer range is invalid" if $lblk > $hblk; } when ('c') { if ($val !~ /(none|write(back|through))/) { die "'$val' is not valid"; } $cache = $val; } when ('C') { die "'$val' is not valid" if $val <= 0; $concurrency = $val; } when ('D') { ($rrate, $wrate) = ($val =~ /(\d+)\%?:?(\d*)\%?/); } when ('f') { $flush_interval = $val; } when ('F') { $file = 1; } when ('h') { die "'$val' is not valid" if $val <= 0; $hbeat = $val; } when ('n') { $no_act = 1; $verbose = 1; } when ('o') { die "'$val' is not valid" if $val < 0; $offset = $val; } when ('p') { if ($val =~ /^l/) { $seek_pattern = "linear"; } elsif ($val =~ /^r/) { $seek_pattern = "random"; } else { die "'$val' is not valid"; } } when 
('r') { $read_test = 1; if ($write_test) { ($rrate, $wrate) = (50, 50); } else { ($rrate, $wrate) = (100, 0); } } when ('s') { $seed = $val; } when ('S') { ($sblk, $eblk) = ($val =~ /(\d+[kmg]?):?(\d*[kmg]?)/i); to_bytes($sblk); to_bytes($eblk); die "$sblk is not sector aligned" if $sblk % 512 != 0; die "$eblk is not sector aligned" if $eblk % 512 != 0; } when ('T') { die "'$val' is not valid" if $val < 0; $runtime = $val; } when ('v') { $verbose = 1; } when ('w') { $write_test = 1; if ($read_test) { ($rrate, $wrate) = (50, 50); } else { ($rrate, $wrate) = (0, 100); } } } } } sub parse { parse_opts(); if (@ARGV == 0) { die "vdiname must be specified"; } else { $vdiname = shift @ARGV; # process the rest of options parse_opts() if (@ARGV > 0); } die "too many arguments" if @ARGV > 0; if ($file) { $vdisize = `qemu-io -c length $vdiname`; } else { $vdisize = `qemu-io -c length sheepdog:$vdiname`; } to_bytes($vdisize); die "cannot get vdi size" if $vdisize == 0; $eblk = $vdisize if $eblk == 0; die "test block range is invalid" if $sblk >= $eblk; die "transfer size is too large" if $hblk > $eblk - $sblk; } sub vdi_main { my $roffset = $offset; my $woffset = $offset; my ($cur_time, $start_time, $end_time, $hbeat_time); $start_time = $cur_time = get_current_time(); $hbeat_time = $start_time + $hbeat * 1000000; $end_time = $start_time + $runtime * 1000000; srand($seed); while ($cur_time < $end_time) { my $length = get_aligned_blk($lblk, $hblk); while ($nr_outstanding_aio >= $concurrency) { wait_aio_requests(); } if (rand($rrate + $wrate) < $rrate) { # read $length = $eblk - $roffset if $roffset + $length > $eblk; vdi_read($roffset, $length); if ($seek_pattern eq 'linear') { $roffset += $length; $roffset -= $eblk - $sblk while $roffset >= $eblk; } else { $roffset = get_aligned_blk($sblk, $eblk - 512); } } else { # write $length = $eblk - $woffset if $woffset + $length > $eblk; vdi_write($woffset, $length); if ($seek_pattern eq 'linear') { $woffset += $length; $woffset -= 
$eblk - $sblk while $woffset >= $eblk; } else { $woffset = get_aligned_blk($sblk, $eblk - 512); } if ($flush_interval > 0 && $wr_ops % $flush_interval == 0) { vdi_flush(); } } $cur_time = get_current_time(); if ($hbeat > 0 && $hbeat_time <= $cur_time) { print_result('Heartbeat read', $rd_bytes, $rd_ops, $hbeat) if $rrate; print_result('Heartbeat write', $wr_bytes, $wr_ops, $hbeat) if $wrate; $rd_ops = $wr_ops = 0; $rd_bytes = $wr_bytes = 0; $hbeat_time += $hbeat * 1000000; } } print_result('Total read', $total_rd_bytes, $total_rd_ops, $runtime) if $rrate; print_result('Total write', $total_wr_bytes, $total_wr_ops, $runtime) if $wrate; } sub get_current_time { my ($sec, $microsec) = gettimeofday(); return $sec * 1000000 + $microsec; } sub print_result { my ($label, $bytes, $ops, $t) = @_; printf "$label throughput: %.1fB/s (%s/s), IOPS %.1f/s.\n", $bytes / $t, to_str($bytes / $t), $ops / $t; } sub help { my ($status) = @_; print < seconds. -n print events that would occur but do not access disk. -o offset set the start offset. -p seek_pattern set the pattern of disk seeks. seek_pattern is "linear" or "random". -r read data from vdi. -s seed set seed for random number generation. -S sblk[:eblk] set the start [and stop] test block. -T runtime run until seconds have elapsed. -v verbose mode. -w write data to vdi. END_OF_HELP exit($status); } sheepdog-0.7.5/sheep/000077500000000000000000000000001223630776600144415ustar00rootroot00000000000000sheepdog-0.7.5/sheep/Makefile.am000066400000000000000000000035131223630776600164770ustar00rootroot00000000000000# # Copyright 2010 Red Hat, Inc. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; see the file COPYING. If not, write to # the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. # MAINTAINERCLEANFILES = Makefile.in AM_CFLAGS = AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include \ $(libcpg_CFLAGS) $(libcfg_CFLAGS) $(libacrd_CFLAGS) sbin_PROGRAMS = sheep sheep_SOURCES = sheep.c group.c request.c gateway.c store.c vdi.c \ journal.c ops.c recovery.c cluster/local.c \ object_cache.c object_list_cache.c \ plain_store.c config.c migrate.c md.c \ cluster/shepherd.c if BUILD_HTTP sheep_SOURCES += http.c endif if BUILD_COROSYNC sheep_SOURCES += cluster/corosync.c endif if BUILD_ZOOKEEPER sheep_SOURCES += cluster/zookeeper.c endif if BUILD_TRACE sheep_SOURCES += trace/trace.c trace/mcount.S trace/graph.c trace/checker.c endif sheep_LDADD = ../lib/libsheepdog.a -lpthread -lm\ $(libcpg_LIBS) $(libcfg_LIBS) $(libacrd_LIBS) $(LIBS) sheep_DEPENDENCIES = ../lib/libsheepdog.a noinst_HEADERS = sheep_priv.h cluster.h trace/trace.h EXTRA_DIST = all-local: @echo Built sheep clean-local: rm -f sheep *.o gmon.out *.da *.bb *.bbg # support for GNU Flymake check-syntax: $(COMPILE) -fsyntax-only $(CHK_SOURCES) check-style: @$(CHECK_STYLE) $(sheep_SOURCES) $(noinst_HEADERS) coverage: @lcov -d . -c -o sheep.info sheepdog-0.7.5/sheep/cluster.h000066400000000000000000000113251223630776600162750ustar00rootroot00000000000000/* * Copyright (C) 2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #ifndef __CLUSTER_H__ #define __CLUSTER_H__ #include #include #include #include #include #include #include #include "sheepdog_proto.h" #include "sheep.h" #include "config.h" /* maximum payload size sent in ->notify and ->unblock */ #define SD_MAX_EVENT_BUF_SIZE (128 * 1024) /* 128k */ struct cluster_driver { const char *name; /* * Initialize the cluster driver * * Returns zero on success, -1 on error. */ int (*init)(const char *option); /* * Get a node ID for this sheep. * * Gets and ID that is used in all communication with other sheep, * which normally would be a string formatted IP address. * * Returns zero on success, -1 on error. */ int (*get_local_addr)(uint8_t *myaddr); /* * Join the cluster * * This function is used to join the cluster, and notifies a join * event to all the nodes. The copy of 'opaque' is passed to * sd_join_handler() and sd_accept_handler(). * * sd_join_handler() must be called on at least one node which already * paticipates in the cluster. If the content of 'opaque' is changed in * sd_join_handler(), the updated 'opaque' must be passed to * sd_accept_handler(). * * Returns zero on success, -1 on error */ int (*join)(const struct sd_node *myself, void *opaque, size_t opaque_len); /* * Leave the cluster * * This function is used to leave the cluster, and notifies a * leave event to all the nodes. The cluster driver calls event * handlers even after this function is called, so the left node can * work as a gateway. * * Returns zero on success, -1 on error */ int (*leave)(void); /* * Notify a message to all nodes in the cluster * * This function sends 'msg' to all the nodes. The notified messages * can be read through sd_notify_handler() and totally ordered with * node change events. * * Returns SD_RES_XXX */ int (*notify)(void *msg, size_t msg_len); /* * Send a message to all nodes to block further events. 
* * Once the cluster driver has ensured that events are blocked on all * nodes it needs to call sd_block_handler() on the node where ->block * was called. * * Returns SD_RES_XXX */ int (*block)(void); /* * Unblock events on all nodes, and send a total order message * to all nodes. * * Returns SD_RES_XXX */ int (*unblock)(void *msg, size_t msg_len); /* * Update the specific node in the driver's private copy of nodes * * Returns SD_RES_XXX */ int (*update_node)(struct sd_node *); struct list_head list; }; extern struct list_head cluster_drivers; #ifdef HAVE_COROSYNC #define DEFAULT_CLUSTER_DRIVER "corosync" #else #define DEFAULT_CLUSTER_DRIVER "local" #endif /* HAVE_COROSYNC */ #define cdrv_register(driver) \ static void __attribute__((constructor)) regist_ ## driver(void) \ { \ if (!driver.init || !driver.join || !driver.leave || !driver.notify) \ panic("the driver '%s' is incomplete", driver.name); \ list_add(&driver.list, &cluster_drivers); \ } #define FOR_EACH_CLUSTER_DRIVER(driver) \ list_for_each_entry(driver, &cluster_drivers, list) static inline struct cluster_driver *find_cdrv(const char *name) { struct cluster_driver *cdrv; int len; FOR_EACH_CLUSTER_DRIVER(cdrv) { len = strlen(cdrv->name); if (strncmp(cdrv->name, name, len) == 0 && (name[len] == ':' || name[len] == '\0')) return cdrv; } return NULL; } static inline const char *get_cdrv_option(const struct cluster_driver *cdrv, const char *arg) { int len = strlen(cdrv->name); if (arg[len] == ':') return strdup(arg + len + 1); else return NULL; } /* callbacks back into sheepdog from the cluster drivers */ void sd_accept_handler(const struct sd_node *joined, const struct sd_node *members, size_t nr_members, const void *opaque); void sd_leave_handler(const struct sd_node *left, const struct sd_node *members, size_t nr_members); void sd_notify_handler(const struct sd_node *sender, void *msg, size_t msg_len); bool sd_block_handler(const struct sd_node *sender); int sd_reconnect_handler(void); void 
sd_update_node_handler(struct sd_node *); bool sd_join_handler(const struct sd_node *joining, const struct sd_node *nodes, size_t nr_nodes, void *opaque); void recalculate_vnodes(struct sd_node *nodes, int nr_nodes); #endif sheepdog-0.7.5/sheep/cluster/000077500000000000000000000000001223630776600161225ustar00rootroot00000000000000sheepdog-0.7.5/sheep/cluster/corosync.c000066400000000000000000000444711223630776600201370ustar00rootroot00000000000000/* * Copyright (C) 2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include #include #include #include "cluster.h" #include "event.h" #include "work.h" #define CPG_INIT_RETRY_CNT 10 struct cpg_node { uint32_t nodeid; uint32_t pid; struct sd_node node; }; static cpg_handle_t cpg_handle; static struct cpg_name cpg_group = { 8, "sheepdog" }; static corosync_cfg_handle_t cfg_handle; static struct cpg_node this_node; static LIST_HEAD(corosync_block_event_list); static LIST_HEAD(corosync_nonblock_event_list); static struct cpg_node cpg_nodes[SD_MAX_NODES]; static size_t nr_cpg_nodes; static bool self_elect; static bool join_finished; static int cpg_fd; static size_t nr_majority; /* used for network partition detection */ /* event types which are dispatched in corosync_dispatch() */ enum corosync_event_type { COROSYNC_EVENT_TYPE_JOIN, COROSYNC_EVENT_TYPE_ACCEPT, COROSYNC_EVENT_TYPE_LEAVE, COROSYNC_EVENT_TYPE_BLOCK, COROSYNC_EVENT_TYPE_NOTIFY, COROSYNC_EVENT_TYPE_UPDATE_NODE, }; /* multicast message type */ enum corosync_message_type { COROSYNC_MSG_TYPE_JOIN, COROSYNC_MSG_TYPE_ACCEPT, COROSYNC_MSG_TYPE_LEAVE, COROSYNC_MSG_TYPE_NOTIFY, COROSYNC_MSG_TYPE_BLOCK, COROSYNC_MSG_TYPE_UNBLOCK, 
COROSYNC_MSG_TYPE_UPDATE_NODE, }; struct corosync_event { enum corosync_event_type type; struct cpg_node sender; void *msg; size_t msg_len; uint32_t nr_nodes; struct cpg_node nodes[SD_MAX_NODES]; bool callbacked; struct list_head list; }; struct corosync_message { struct cpg_node sender; enum corosync_message_type type:16; uint16_t nr_nodes; uint32_t msg_len; struct cpg_node nodes[SD_MAX_NODES]; uint8_t msg[0]; }; static int cpg_node_cmp(struct cpg_node *a, struct cpg_node *b) { int cmp = intcmp(a->nodeid, b->nodeid); if (cmp == 0) cmp = intcmp(a->pid, b->pid); return cmp; } static bool cpg_node_equal(struct cpg_node *a, struct cpg_node *b) { return cpg_node_cmp(a, b) == 0; } static inline int find_sd_node(struct cpg_node *nodes, size_t nr_nodes, struct sd_node *key) { int i; for (i = 0; i < nr_nodes; i++) if (node_eq(&nodes[i].node, key)) return i; return -1; } static inline void add_cpg_node(struct cpg_node *nodes, size_t nr_nodes, struct cpg_node *added) { nodes[nr_nodes++] = *added; } static inline void del_cpg_node(struct cpg_node *nodes, size_t nr_nodes, struct cpg_node *deled) { xlremove(deled, nodes, &nr_nodes, cpg_node_cmp); } static int corosync_get_local_addr(uint8_t *addr) { int ret, nr; corosync_cfg_node_address_t caddr; struct sockaddr_storage *ss = (struct sockaddr_storage *)caddr.address; struct sockaddr_in *sin = (struct sockaddr_in *)caddr.address; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)caddr.address; void *saddr; ret = corosync_cfg_get_node_addrs(cfg_handle, this_node.nodeid, 1, &nr, &caddr); if (ret != CS_OK) { sd_err("failed to get node addresses (%d)", ret); return -1; } if (!nr) { sd_err("no node addresses found"); return -1; } if (ss->ss_family == AF_INET6) { saddr = &sin6->sin6_addr; memcpy(addr, saddr, 16); } else if (ss->ss_family == AF_INET) { saddr = &sin->sin_addr; memset(addr, 0, 16); memcpy(addr + 12, saddr, 4); } else { sd_err("unknown protocol %d", ss->ss_family); return -1; } return 0; } static int send_message(enum 
corosync_message_type type, struct cpg_node *sender, struct cpg_node *nodes, size_t nr_nodes, void *msg, size_t msg_len) { struct iovec iov[2]; int ret, iov_cnt = 1; struct corosync_message cmsg = { .type = type, .msg_len = msg_len, .sender = *sender, .nr_nodes = nr_nodes, }; if (nodes) memcpy(cmsg.nodes, nodes, sizeof(*nodes) * nr_nodes); iov[0].iov_base = &cmsg; iov[0].iov_len = sizeof(cmsg); if (msg) { iov[1].iov_base = msg; iov[1].iov_len = msg_len; iov_cnt++; } retry: ret = cpg_mcast_joined(cpg_handle, CPG_TYPE_AGREED, iov, iov_cnt); switch (ret) { case CS_OK: break; case CS_ERR_TRY_AGAIN: sd_debug("failed to send message: retrying"); sleep(1); goto retry; default: sd_err("failed to send message (%d)", ret); return SD_RES_CLUSTER_ERROR; } return SD_RES_SUCCESS; } static inline struct corosync_event * find_block_event(enum corosync_event_type type, struct cpg_node *sender) { struct corosync_event *cevent; list_for_each_entry(cevent, &corosync_block_event_list, list) { if (cevent->type == type && cpg_node_equal(&cevent->sender, sender)) return cevent; } return NULL; } static inline struct corosync_event * find_nonblock_event(enum corosync_event_type type, struct cpg_node *sender) { struct corosync_event *cevent; list_for_each_entry(cevent, &corosync_nonblock_event_list, list) { if (cevent->type == type && cpg_node_equal(&cevent->sender, sender)) return cevent; } return NULL; } static inline struct corosync_event * find_event(enum corosync_event_type type, struct cpg_node *sender) { if (type == COROSYNC_EVENT_TYPE_BLOCK) return find_block_event(type, sender); else return find_nonblock_event(type, sender); } static void build_node_list(const struct cpg_node *nodes, size_t nr_nodes, struct sd_node *entries) { int i; for (i = 0; i < nr_nodes; i++) entries[i] = nodes[i].node; } /* * Process one dispatch event * * Returns true if the event is processed */ static bool __corosync_dispatch_one(struct corosync_event *cevent) { struct sd_node entries[SD_MAX_NODES], *node; 
struct cpg_node *n; int idx; switch (cevent->type) { case COROSYNC_EVENT_TYPE_JOIN: if (!cevent->msg) /* we haven't receive JOIN yet */ return false; if (cevent->callbacked) /* sd_join_handler() must be called only once */ return false; build_node_list(cpg_nodes, nr_cpg_nodes, entries); if (sd_join_handler(&cevent->sender.node, entries, nr_cpg_nodes, cevent->msg)) { send_message(COROSYNC_MSG_TYPE_ACCEPT, &cevent->sender, cpg_nodes, nr_cpg_nodes, cevent->msg, cevent->msg_len); cevent->callbacked = true; } return false; case COROSYNC_EVENT_TYPE_ACCEPT: add_cpg_node(cpg_nodes, nr_cpg_nodes, &cevent->sender); nr_cpg_nodes++; build_node_list(cpg_nodes, nr_cpg_nodes, entries); sd_accept_handler(&cevent->sender.node, entries, nr_cpg_nodes, cevent->msg); break; case COROSYNC_EVENT_TYPE_LEAVE: n = xlfind(&cevent->sender, cpg_nodes, nr_cpg_nodes, cpg_node_cmp); if (n == NULL) break; cevent->sender.node = n->node; del_cpg_node(cpg_nodes, nr_cpg_nodes, &cevent->sender); nr_cpg_nodes--; build_node_list(cpg_nodes, nr_cpg_nodes, entries); sd_leave_handler(&cevent->sender.node, entries, nr_cpg_nodes); break; case COROSYNC_EVENT_TYPE_BLOCK: if (cevent->callbacked) /* * block events until the unblock message * removes this event */ return false; cevent->callbacked = sd_block_handler(&cevent->sender.node); return false; case COROSYNC_EVENT_TYPE_NOTIFY: sd_notify_handler(&cevent->sender.node, cevent->msg, cevent->msg_len); break; case COROSYNC_EVENT_TYPE_UPDATE_NODE: node = &cevent->sender.node; if (cpg_node_equal(&cevent->sender, &this_node)) this_node = cevent->sender; idx = find_sd_node(cpg_nodes, nr_cpg_nodes, node); assert(idx >= 0); cpg_nodes[idx].node = *node; sd_update_node_handler(node); break; } return true; } static bool update_join_status(struct corosync_event *cevent) { if (join_finished) return true; switch (cevent->type) { case COROSYNC_EVENT_TYPE_JOIN: if (self_elect) { nr_cpg_nodes = 0; return true; } break; case COROSYNC_EVENT_TYPE_ACCEPT: if 
(cpg_node_equal(&cevent->sender, &this_node)) { nr_cpg_nodes = cevent->nr_nodes; memcpy(cpg_nodes, cevent->nodes, sizeof(*cevent->nodes) * cevent->nr_nodes); return true; } break; default: break; } return false; } static void __corosync_dispatch(void) { struct corosync_event *cevent; struct pollfd pfd = { .fd = cpg_fd, .events = POLLIN, }; if (poll(&pfd, 1, 0)) { /* * Corosync dispatches leave events one by one even * when network partition has occured. To count the * number of alive nodes correctly, we postpone * processsing events if there are incoming ones. */ sd_debug("wait for a next dispatch event"); return; } nr_majority = 0; while (!list_empty(&corosync_block_event_list) || !list_empty(&corosync_nonblock_event_list)) { if (!list_empty(&corosync_nonblock_event_list)) cevent = list_first_entry(&corosync_nonblock_event_list, typeof(*cevent), list); else cevent = list_first_entry(&corosync_block_event_list, typeof(*cevent), list); join_finished = update_join_status(cevent); if (join_finished) { if (!__corosync_dispatch_one(cevent)) return; } else { switch (cevent->type) { case COROSYNC_MSG_TYPE_JOIN: case COROSYNC_MSG_TYPE_BLOCK: return; default: break; } } list_del(&cevent->list); free(cevent->msg); free(cevent); } } static struct corosync_event * update_event(enum corosync_event_type type, struct cpg_node *sender, void *msg, size_t msg_len) { struct corosync_event *cevent; cevent = find_event(type, sender); if (!cevent) /* block message was casted before this node joins */ return NULL; cevent->msg_len = msg_len; if (msg_len) { cevent->msg = realloc(cevent->msg, msg_len); if (!cevent->msg) panic("failed to allocate memory"); memcpy(cevent->msg, msg, msg_len); } else { free(cevent->msg); cevent->msg = NULL; } return cevent; } static void queue_event(struct corosync_event *cevent) { if (cevent->type == COROSYNC_EVENT_TYPE_BLOCK) list_add_tail(&cevent->list, &corosync_block_event_list); else list_add_tail(&cevent->list, &corosync_nonblock_event_list); } static 
void cdrv_cpg_deliver(cpg_handle_t handle, const struct cpg_name *group_name, uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len) { struct corosync_event *cevent; struct corosync_message *cmsg = msg; sd_debug("%d", cmsg->type); switch (cmsg->type) { case COROSYNC_MSG_TYPE_JOIN: cevent = update_event(COROSYNC_EVENT_TYPE_JOIN, &cmsg->sender, cmsg->msg, cmsg->msg_len); if (!cevent) break; cevent->sender = cmsg->sender; cevent->msg_len = cmsg->msg_len; break; case COROSYNC_MSG_TYPE_UNBLOCK: cevent = update_event(COROSYNC_EVENT_TYPE_BLOCK, &cmsg->sender, cmsg->msg, cmsg->msg_len); if (cevent) { list_del(&cevent->list); free(cevent->msg); free(cevent); } /* fall through */ case COROSYNC_MSG_TYPE_BLOCK: case COROSYNC_MSG_TYPE_NOTIFY: case COROSYNC_MSG_TYPE_UPDATE_NODE: cevent = xzalloc(sizeof(*cevent)); switch (cmsg->type) { case COROSYNC_MSG_TYPE_BLOCK: cevent->type = COROSYNC_EVENT_TYPE_BLOCK; break; case COROSYNC_MSG_TYPE_UPDATE_NODE: cevent->type = COROSYNC_EVENT_TYPE_UPDATE_NODE; break; default: cevent->type = COROSYNC_EVENT_TYPE_NOTIFY; break; } cevent->sender = cmsg->sender; cevent->msg_len = cmsg->msg_len; if (cmsg->msg_len) { cevent->msg = xzalloc(cmsg->msg_len); memcpy(cevent->msg, cmsg->msg, cmsg->msg_len); } else cevent->msg = NULL; queue_event(cevent); break; case COROSYNC_MSG_TYPE_LEAVE: cevent = xzalloc(sizeof(*cevent)); cevent->type = COROSYNC_EVENT_TYPE_LEAVE; cevent->sender = cmsg->sender; cevent->msg_len = cmsg->msg_len; if (cmsg->msg_len) { cevent->msg = xzalloc(cmsg->msg_len); memcpy(cevent->msg, cmsg->msg, cmsg->msg_len); } else cevent->msg = NULL; queue_event(cevent); break; case COROSYNC_MSG_TYPE_ACCEPT: cevent = update_event(COROSYNC_EVENT_TYPE_JOIN, &cmsg->sender, cmsg->msg, cmsg->msg_len); if (!cevent) break; cevent->type = COROSYNC_EVENT_TYPE_ACCEPT; cevent->nr_nodes = cmsg->nr_nodes; memcpy(cevent->nodes, cmsg->nodes, sizeof(*cmsg->nodes) * cmsg->nr_nodes); break; } __corosync_dispatch(); } static void build_cpg_node_list(struct cpg_node 
*nodes, const struct cpg_address *list, size_t nr) { int i; for (i = 0; i < nr; i++) { nodes[i].nodeid = list[i].nodeid; nodes[i].pid = list[i].pid; } } static void cdrv_cpg_confchg(cpg_handle_t handle, const struct cpg_name *group_name, const struct cpg_address *member_list, size_t member_list_entries, const struct cpg_address *left_list, size_t left_list_entries, const struct cpg_address *joined_list, size_t joined_list_entries) { struct corosync_event *cevent; int i; struct cpg_node member_sheep[SD_MAX_NODES]; struct cpg_node joined_sheep[SD_MAX_NODES]; struct cpg_node left_sheep[SD_MAX_NODES]; bool promote = true; sd_debug("mem:%zu, joined:%zu, left:%zu", member_list_entries, joined_list_entries, left_list_entries); /* check network partition */ if (left_list_entries) { if (nr_majority == 0) { size_t total = member_list_entries + left_list_entries; /* * we need at least 3 nodes to handle network * partition failure */ if (total > 2) nr_majority = total / 2 + 1; } if (member_list_entries == 0) panic("NIC failure?"); if (member_list_entries < nr_majority) panic("Network partition is detected"); } /* convert cpg_address to cpg_node */ build_cpg_node_list(member_sheep, member_list, member_list_entries); build_cpg_node_list(left_sheep, left_list, left_list_entries); build_cpg_node_list(joined_sheep, joined_list, joined_list_entries); /* dispatch leave_handler */ for (i = 0; i < left_list_entries; i++) { cevent = find_event(COROSYNC_EVENT_TYPE_JOIN, left_sheep + i); if (cevent) { /* the node left before joining */ list_del(&cevent->list); free(cevent->msg); free(cevent); continue; } cevent = find_event(COROSYNC_EVENT_TYPE_BLOCK, left_sheep + i); if (cevent) { /* the node left before sending UNBLOCK */ list_del(&cevent->list); free(cevent->msg); free(cevent); } cevent = xzalloc(sizeof(*cevent)); cevent->type = COROSYNC_EVENT_TYPE_LEAVE; cevent->sender = left_sheep[i]; queue_event(cevent); } /* dispatch join_handler */ for (i = 0; i < joined_list_entries; i++) { cevent 
= xzalloc(sizeof(*cevent)); cevent->type = COROSYNC_EVENT_TYPE_JOIN; cevent->sender = joined_sheep[i]; queue_event(cevent); } if (!join_finished) { /* * Exactly one non-master member has seen join events for * all other members, because events are ordered. */ for (i = 0; i < member_list_entries; i++) { cevent = find_event(COROSYNC_EVENT_TYPE_JOIN, &member_sheep[i]); if (!cevent) { sd_debug("Not promoting because member is not " "in our event list."); promote = false; break; } } /* * If we see the join events for all nodes promote ourself to * master right here. */ if (promote) self_elect = true; } __corosync_dispatch(); } static int corosync_join(const struct sd_node *myself, void *opaque, size_t opaque_len) { int ret; retry: ret = cpg_join(cpg_handle, &cpg_group); switch (ret) { case CS_OK: break; case CS_ERR_TRY_AGAIN: sd_debug("failed to join the sheepdog group: retrying"); sleep(1); goto retry; case CS_ERR_SECURITY: sd_err("permission denied to join the sheepdog group"); return -1; default: sd_err("failed to join the sheepdog group (%d)", ret); return -1; } this_node.node = *myself; ret = send_message(COROSYNC_MSG_TYPE_JOIN, &this_node, NULL, 0, opaque, opaque_len); return ret; } static int corosync_leave(void) { return send_message(COROSYNC_MSG_TYPE_LEAVE, &this_node, NULL, 0, NULL, 0); } static int corosync_block(void) { return send_message(COROSYNC_MSG_TYPE_BLOCK, &this_node, NULL, 0, NULL, 0); } static int corosync_unblock(void *msg, size_t msg_len) { return send_message(COROSYNC_MSG_TYPE_UNBLOCK, &this_node, NULL, 0, msg, msg_len); } static int corosync_notify(void *msg, size_t msg_len) { return send_message(COROSYNC_MSG_TYPE_NOTIFY, &this_node, NULL, 0, msg, msg_len); } static void corosync_handler(int listen_fd, int events, void *data) { int ret; if (events & EPOLLHUP) { sd_err("corosync driver received EPOLLHUP event, exiting."); goto out; } ret = cpg_dispatch(cpg_handle, CS_DISPATCH_ALL); if (ret != CS_OK) { sd_err("cpg_dispatch returned %d", ret); 
goto out; } return; out: log_close(); exit(1); } static int corosync_init(const char *option) { int ret, retry_cnt = 0; uint32_t nodeid; cpg_callbacks_t cb = { .cpg_deliver_fn = cdrv_cpg_deliver, .cpg_confchg_fn = cdrv_cpg_confchg }; again: ret = cpg_initialize(&cpg_handle, &cb); switch (ret) { case CS_OK: /* success */ break; case CS_ERR_TRY_AGAIN: if (retry_cnt++ == CPG_INIT_RETRY_CNT) { sd_err("failed to initialize cpg (%d) - " "is corosync running?", ret); return -1; } sd_debug("retry cpg_initialize"); usleep(200000); goto again; case CS_ERR_SECURITY: sd_err("failed to initialize cpg - permission denied"); return -1; default: sd_err("failed to initialize cpg (%d)", ret); return -1; } ret = corosync_cfg_initialize(&cfg_handle, NULL); if (ret != CS_OK) { sd_err("failed to initialize cfg (%d)", ret); return -1; } ret = corosync_cfg_local_get(cfg_handle, &nodeid); if (ret != CS_OK) { sd_err("failed to get node id (%d)", ret); return -1; } this_node.nodeid = nodeid; this_node.pid = getpid(); ret = cpg_fd_get(cpg_handle, &cpg_fd); if (ret != CS_OK) { sd_err("failed to get cpg file descriptor (%d)", ret); return -1; } ret = register_event(cpg_fd, corosync_handler, NULL); if (ret) { sd_err("failed to register corosync event handler (%d)", ret); return -1; } return 0; } static int corosync_update_node(struct sd_node *node) { struct cpg_node cnode = this_node; cnode.node = *node; return send_message(COROSYNC_MSG_TYPE_UPDATE_NODE, &cnode, NULL, 0, NULL, 0); } static struct cluster_driver cdrv_corosync = { .name = "corosync", .init = corosync_init, .get_local_addr = corosync_get_local_addr, .join = corosync_join, .leave = corosync_leave, .notify = corosync_notify, .block = corosync_block, .unblock = corosync_unblock, .update_node = corosync_update_node, }; cdrv_register(cdrv_corosync); sheepdog-0.7.5/sheep/cluster/local.c000066400000000000000000000266731223630776600173760ustar00rootroot00000000000000/* * Copyright (C) 2011 Nippon Telegraph and Telephone Corporation. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include #include #include #include #include #include #include "cluster.h" #include "event.h" #include "work.h" #include "util.h" #define MAX_EVENTS 500 #define PROCESS_CHECK_INTERVAL 50 /* ms */ static const char *shmfile = "/tmp/sheepdog_shm"; static int shmfd; static int sigfd; static int block_event_pos; static int nonblock_event_pos; static struct local_node this_node; static bool joined; struct local_node { struct sd_node node; pid_t pid; bool gateway; }; static const char *lnode_to_str(struct local_node *lnode) { static __thread char s[MAX_NODE_STR_LEN + 32]; snprintf(s, sizeof(s), "%s pid:%d", node_to_str(&lnode->node), lnode->pid); return s; } static int lnode_cmp(const struct local_node *a, const struct local_node *b) { return node_cmp(&a->node, &b->node); } static bool lnode_eq(const struct local_node *a, const struct local_node *b) { return lnode_cmp(a, b) == 0; } enum local_event_type { EVENT_JOIN = 1, EVENT_ACCEPT, EVENT_LEAVE, EVENT_GATEWAY, EVENT_BLOCK, EVENT_NOTIFY, EVENT_UPDATE_NODE, }; struct local_event { enum local_event_type type; struct local_node sender; bool callbacked; bool removed; size_t buf_len; uint8_t buf[SD_MAX_EVENT_BUF_SIZE]; size_t nr_lnodes; /* the number of sheep processes */ struct local_node lnodes[SD_MAX_NODES]; }; /* shared memory queue */ static struct shm_queue { int block_event_pos; struct local_event block_events[MAX_EVENTS]; int nonblock_event_pos; struct local_event nonblock_events[MAX_EVENTS]; } *shm_queue; static void shm_queue_lock(void) { flock(shmfd, LOCK_EX); } static void shm_queue_unlock(void) { flock(shmfd, LOCK_UN); } static size_t get_nodes(struct local_node *n) { struct local_event *ev; 
ev = shm_queue->nonblock_events + shm_queue->nonblock_event_pos; if (n) memcpy(n, ev->lnodes, sizeof(ev->lnodes)); return ev->nr_lnodes; } static int process_exists(pid_t pid) { return kill(pid, 0) == 0; } static struct local_event *shm_queue_peek_block_event(void) { return shm_queue->block_events + (block_event_pos + 1) % MAX_EVENTS; } static struct local_event *shm_queue_peek_nonblock_event(void) { return shm_queue->nonblock_events + (nonblock_event_pos + 1) % MAX_EVENTS; } static struct local_event *shm_queue_peek(void) { /* try to peek nonblock queue first */ if (nonblock_event_pos != shm_queue->nonblock_event_pos) return shm_queue_peek_nonblock_event(); else if (block_event_pos != shm_queue->block_event_pos) return shm_queue_peek_block_event(); else return NULL; } static void shm_queue_push(struct local_event *ev) { int pos; if (ev->type == EVENT_BLOCK) { pos = (shm_queue->block_event_pos + 1) % MAX_EVENTS; shm_queue->block_events[pos] = *ev; msync(shm_queue->block_events + pos, sizeof(*ev), MS_SYNC); shm_queue->block_event_pos = pos; msync(&shm_queue->block_event_pos, sizeof(pos), MS_SYNC); } else { pos = (shm_queue->nonblock_event_pos + 1) % MAX_EVENTS; shm_queue->nonblock_events[pos] = *ev; msync(shm_queue->nonblock_events + pos, sizeof(*ev), MS_SYNC); shm_queue->nonblock_event_pos = pos; msync(&shm_queue->nonblock_event_pos, sizeof(pos), MS_SYNC); } } static void shm_queue_remove(struct local_event *ev) { if (ev == shm_queue_peek_block_event()) block_event_pos = (block_event_pos + 1) % MAX_EVENTS; else nonblock_event_pos = (nonblock_event_pos + 1) % MAX_EVENTS; } static void shm_queue_notify(void) { int i; size_t nr; struct local_node lnodes[SD_MAX_NODES]; nr = get_nodes(lnodes); for (i = 0; i < nr; i++) { sd_debug("send signal to %s", lnode_to_str(lnodes + i)); kill(lnodes[i].pid, SIGUSR1); } } static bool is_shm_queue_valid(void) { int i; size_t nr; struct local_node lnodes[SD_MAX_NODES]; nr = get_nodes(lnodes); if (nr == 0) return true; for (i = 0; i < 
nr; i++) if (process_exists(lnodes[i].pid)) return true; return false; } static void shm_queue_init(void) { int ret; shmfd = open(shmfile, O_CREAT | O_RDWR, 0644); if (shmfd < 0) panic("cannot open shared file, %s", shmfile); shm_queue_lock(); ret = xftruncate(shmfd, sizeof(*shm_queue)); if (ret != 0) panic("failed to truncate shmfile, %m"); shm_queue = mmap(NULL, sizeof(*shm_queue), PROT_READ | PROT_WRITE, MAP_SHARED, shmfd, 0); if (shm_queue == MAP_FAILED) panic("mmap error, %m"); if (is_shm_queue_valid()) { block_event_pos = shm_queue->block_event_pos; nonblock_event_pos = shm_queue->nonblock_event_pos; } else { /* initialize shared memory */ block_event_pos = 0; nonblock_event_pos = 0; ret = xftruncate(shmfd, 0); if (ret != 0) panic("failed to truncate shmfile, %m"); ret = xftruncate(shmfd, sizeof(*shm_queue)); if (ret != 0) panic("failed to truncate shmfile, %m"); } shm_queue_unlock(); } static int add_event(enum local_event_type type, struct local_node *lnode, void *buf, size_t buf_len) { struct local_node *n; struct local_event ev = { .type = type, .sender = *lnode, }; ev.buf_len = buf_len; if (buf) memcpy(ev.buf, buf, buf_len); ev.nr_lnodes = get_nodes(ev.lnodes); switch (type) { case EVENT_JOIN: ev.lnodes[ev.nr_lnodes] = *lnode; ev.nr_lnodes++; break; case EVENT_LEAVE: xlremove(lnode, ev.lnodes, &ev.nr_lnodes, lnode_cmp); break; case EVENT_GATEWAY: n = xlfind(lnode, ev.lnodes, ev.nr_lnodes, lnode_cmp); n->gateway = true; break; case EVENT_NOTIFY: case EVENT_BLOCK: break; case EVENT_UPDATE_NODE: n = xlfind(lnode, ev.lnodes, ev.nr_lnodes, lnode_cmp); n->node = lnode->node; break; case EVENT_ACCEPT: abort(); } sd_debug("type = %d, sender = %s", ev.type, lnode_to_str(&ev.sender)); for (int i = 0; i < ev.nr_lnodes; i++) sd_debug("%d: %s", i, lnode_to_str(ev.lnodes + i)); shm_queue_push(&ev); shm_queue_notify(); return SD_RES_SUCCESS; } static int add_event_lock(enum local_event_type type, struct local_node *lnode, void *buf, size_t buf_len) { int ret; 
shm_queue_lock(); ret = add_event(type, lnode, buf, buf_len); shm_queue_unlock(); return ret; } static void check_pids(void *arg) { int i; size_t nr; struct local_node lnodes[SD_MAX_NODES]; struct local_event *ev; shm_queue_lock(); nr = get_nodes(lnodes); for (i = 0; i < nr; i++) if (!process_exists(lnodes[i].pid)) { add_event(EVENT_LEAVE, lnodes + i, NULL, 0); /* unblock blocking event if sender has gone */ ev = shm_queue_peek_block_event(); if (lnode_eq(lnodes + i, &ev->sender)) { ev->removed = true; msync(ev, sizeof(*ev), MS_SYNC); } } shm_queue_unlock(); add_timer(arg, PROCESS_CHECK_INTERVAL); } /* Local driver APIs */ static int local_join(const struct sd_node *myself, void *opaque, size_t opaque_len) { this_node.node = *myself; this_node.pid = getpid(); this_node.gateway = false; return add_event_lock(EVENT_JOIN, &this_node, opaque, opaque_len); } static int local_leave(void) { return add_event_lock(EVENT_GATEWAY, &this_node, NULL, 0); } static int local_notify(void *msg, size_t msg_len) { return add_event_lock(EVENT_NOTIFY, &this_node, msg, msg_len); } static int local_block(void) { return add_event_lock(EVENT_BLOCK, &this_node, NULL, 0); } static int local_unblock(void *msg, size_t msg_len) { struct local_event *ev; shm_queue_lock(); ev = shm_queue_peek_block_event(); ev->removed = true; msync(ev, sizeof(*ev), MS_SYNC); add_event(EVENT_NOTIFY, &this_node, msg, msg_len); shm_queue_unlock(); return SD_RES_SUCCESS; } /* Returns true if an event is processed */ static bool local_process_event(void) { struct local_event *ev; int i; struct sd_node nodes[SD_MAX_NODES]; size_t nr_nodes; ev = shm_queue_peek(); if (!ev) return false; sd_debug("type = %d, sender = %s", ev->type, lnode_to_str(&ev->sender)); sd_debug("callbacked = %d, removed = %d", ev->callbacked, ev->removed); nr_nodes = 0; for (i = 0; i < ev->nr_lnodes; i++) { sd_debug("%d: %s", i, lnode_to_str(ev->lnodes + i)); if (!ev->lnodes[i].gateway) nodes[nr_nodes++] = ev->lnodes[i].node; } if (ev->removed) 
goto out; if (ev->callbacked) return false; /* wait for unblock event */ if (!joined) { if (!lnode_eq(&this_node, &ev->sender)) goto out; switch (ev->type) { case EVENT_JOIN: break; case EVENT_ACCEPT: sd_debug("join Sheepdog"); joined = true; break; default: goto out; } } switch (ev->type) { case EVENT_JOIN: /* nodes[nr_nodes - 1] is a sender, so don't include it */ assert(node_eq(&ev->sender.node, &nodes[nr_nodes - 1])); if (sd_join_handler(&ev->sender.node, nodes, nr_nodes - 1, ev->buf)) { ev->type = EVENT_ACCEPT; msync(ev, sizeof(*ev), MS_SYNC); shm_queue_notify(); } return false; case EVENT_ACCEPT: sd_accept_handler(&ev->sender.node, nodes, nr_nodes, ev->buf); break; case EVENT_LEAVE: if (ev->sender.gateway) { sd_debug("gateway %s left sheepdog", lnode_to_str(&ev->sender)); break; } /* fall through */ case EVENT_GATEWAY: sd_leave_handler(&ev->sender.node, nodes, nr_nodes); break; case EVENT_BLOCK: ev->callbacked = sd_block_handler(&ev->sender.node); msync(ev, sizeof(*ev), MS_SYNC); return false; case EVENT_NOTIFY: sd_notify_handler(&ev->sender.node, ev->buf, ev->buf_len); break; case EVENT_UPDATE_NODE: if (lnode_eq(&ev->sender, &this_node)) this_node = ev->sender; sd_update_node_handler(&ev->sender.node); break; } out: shm_queue_remove(ev); return true; } static void local_handler(int listen_fd, int events, void *data) { struct signalfd_siginfo siginfo; int ret; if (events & EPOLLHUP) { sd_err("local driver received EPOLLHUP event, exiting."); log_close(); exit(1); } sd_debug("read siginfo"); ret = read(sigfd, &siginfo, sizeof(siginfo)); if (ret != sizeof(siginfo)) panic("failed to read from sigfd, %m"); shm_queue_lock(); while (local_process_event()) ; shm_queue_unlock(); } static int local_get_local_addr(uint8_t *myaddr) { /* set 127.0.0.1 */ memset(myaddr, 0, 16); myaddr[12] = 127; myaddr[15] = 1; return 0; } static int local_init(const char *option) { sigset_t mask; int ret; static struct timer t = { .callback = check_pids, .data = &t, }; if (option) 
shmfile = option; shm_queue_init(); sigemptyset(&mask); sigaddset(&mask, SIGUSR1); sigprocmask(SIG_BLOCK, &mask, NULL); sigfd = signalfd(-1, &mask, SFD_NONBLOCK); if (sigfd < 0) { sd_err("failed to create a signal fd: %m"); return -1; } add_timer(&t, PROCESS_CHECK_INTERVAL); ret = register_event(sigfd, local_handler, NULL); if (ret) { sd_err("failed to register local event handler (%d)", ret); return -1; } return 0; } static int local_update_node(struct sd_node *node) { struct local_node lnode = this_node; lnode.node = *node; return add_event_lock(EVENT_UPDATE_NODE, &lnode, NULL, 0); } static struct cluster_driver cdrv_local = { .name = "local", .init = local_init, .get_local_addr = local_get_local_addr, .join = local_join, .leave = local_leave, .notify = local_notify, .block = local_block, .unblock = local_unblock, .update_node = local_update_node, }; cdrv_register(cdrv_local); sheepdog-0.7.5/sheep/cluster/shepherd.c000066400000000000000000000327211223630776600200750ustar00rootroot00000000000000/* * Copyright (C) 2013 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include #include #include #include #include #include #include #include #include #include #include "cluster.h" #include "event.h" #include "shepherd.h" #include "internal_proto.h" #include "net.h" static int sph_comm_fd; static struct sd_node this_node; static int nr_nodes; static struct sd_node nodes[SD_MAX_NODES]; enum sph_driver_state { STATE_PRE_JOIN, STATE_JOINED, }; static enum sph_driver_state state = STATE_PRE_JOIN; static char *kept_opaque; static size_t kept_opaque_len; static int do_shepherd_join(void) { int ret, msg_join_len; struct sph_msg msg; struct sph_msg_join *msg_join; msg_join_len = sizeof(struct sph_msg_join) + kept_opaque_len; memset(&msg, 0, sizeof(msg)); msg.type = SPH_CLI_MSG_JOIN; msg.body_len = msg_join_len; msg_join = xzalloc(msg_join_len); msg_join->new_node = this_node; memcpy(msg_join->opaque, kept_opaque, kept_opaque_len); ret = writev2(sph_comm_fd, &msg, msg_join, msg_join_len); if (sizeof(msg) + msg_join_len != ret) { sd_err("do_shepherd_join() failed, %m"); free(msg_join); return -1; } free(msg_join); return 0; } static void read_msg(struct sph_msg *rcv) { int ret; ret = xread(sph_comm_fd, rcv, sizeof(*rcv)); if (ret != sizeof(*rcv)) { sd_err("xread() failed: %m"); exit(1); } } static void interpret_msg_pre_join(void) { int ret; struct sph_msg snd, rcv; struct sph_msg_join_reply *join_reply; retry: read_msg(&rcv); if (rcv.type == SPH_SRV_MSG_JOIN_RETRY) { sd_info("join request is rejected, retrying"); do_shepherd_join(); goto retry; } else if (rcv.type == SPH_SRV_MSG_NEW_NODE) { struct sph_msg_join *join; int join_len; join_len = rcv.body_len; join = xzalloc(join_len); ret = xread(sph_comm_fd, join, join_len); if (ret != join_len) { sd_err("xread() failed: %m"); exit(1); } /* * FIXME: member change events must be ordered with nonblocked * events */ if (!sd_join_handler(&join->new_node, NULL, 0, join->opaque)) panic("sd_accept_handler() failed"); snd.type = SPH_CLI_MSG_ACCEPT; snd.body_len = join_len; ret = writev2(sph_comm_fd, 
&snd, join, join_len); if (sizeof(snd) + join_len != ret) { sd_err("writev2() failed: %m"); exit(1); } free(join); read_msg(&rcv); } if (rcv.type != SPH_SRV_MSG_JOIN_REPLY) { sd_err("unexpected message from shepherd, received message: %s", sph_srv_msg_to_str(rcv.type)); /* * In this case, the state of this sheep in shepherd must be * SHEEP_STATE_CONNECTED. Messages other than SPH_MSG_JOIN_REPLY * mean bugs of shepherd. */ exit(1); } join_reply = xzalloc(rcv.body_len); ret = xread(sph_comm_fd, join_reply, rcv.body_len); if (ret != rcv.body_len) { sd_err("xread() failed: %m"); exit(1); } sd_info("join reply arrived, nr_nodes: %d", join_reply->nr_nodes); memcpy(nodes, join_reply->nodes, join_reply->nr_nodes * sizeof(struct sd_node)); nr_nodes = join_reply->nr_nodes; /* FIXME: member change events must be ordered with nonblocked events */ sd_accept_handler(&this_node, nodes, nr_nodes, join_reply->opaque); free(join_reply); sd_info("shepherd_join() succeed"); state = STATE_JOINED; } struct sph_event { struct sd_node sender; void *msg; int msg_len; bool callbacked, removed; struct list_head event_list; }; static LIST_HEAD(nonblocked_event_list); static LIST_HEAD(blocked_event_list); static int sph_event_fd; static bool sph_process_event(void) { struct sph_event *ev; bool nonblock; if (!list_empty(&nonblocked_event_list)) { ev = list_first_entry(&nonblocked_event_list, struct sph_event, event_list); nonblock = true; } else if (!list_empty(&blocked_event_list)) { ev = list_first_entry(&blocked_event_list, struct sph_event, event_list); nonblock = false; } else return false; if (ev->removed) goto remove; if (ev->callbacked) return false; if (nonblock) { sd_debug("processing nonblock event"); sd_notify_handler(&ev->sender, ev->msg, ev->msg_len); } else { sd_debug("processing block event"); ev->callbacked = sd_block_handler(&ev->sender); return false; } remove: list_del(&ev->event_list); free(ev->msg); free(ev); return true; } static void push_sph_event(bool nonblock, struct 
sd_node *sender, void *msg, int msg_len) { struct sph_event *ev; sd_debug("push_sph_event() called, pushing %sblocking event", nonblock ? "non" : ""); ev = xzalloc(sizeof(*ev)); ev->sender = *sender; if (msg_len) { ev->msg = xzalloc(msg_len); memcpy(ev->msg, msg, msg_len); ev->msg_len = msg_len; } ev->removed = false; ev->callbacked = false; INIT_LIST_HEAD(&ev->event_list); if (nonblock) list_add_tail(&ev->event_list, &nonblocked_event_list); else list_add_tail(&ev->event_list, &blocked_event_list); eventfd_xwrite(sph_event_fd, 1); } static void remove_one_block_event(void) { struct sph_event *ev; bool removed = false; if (list_empty(&blocked_event_list)) /* FIXME: should I treat this case as an error? */ return; list_for_each_entry(ev, &blocked_event_list, event_list) { if (ev->removed) continue; removed = ev->removed = true; break; } if (!removed) panic("removed is not true"); eventfd_xwrite(sph_event_fd, 1); sd_debug("unblock a blocking event"); } static void sph_event_handler(int fd, int events, void *data) { eventfd_xread(fd); while (sph_process_event()) ; } static void msg_new_node(struct sph_msg *rcv) { int ret; struct sph_msg_join *join; struct sph_msg snd; join = xzalloc(rcv->body_len); ret = xread(sph_comm_fd, join, rcv->body_len); if (ret != rcv->body_len) { sd_err("xread() failed: %m"); exit(1); } /* FIXME: member change events must be ordered with nonblocked events */ if (!sd_join_handler(&join->new_node, join->nodes, join->nr_nodes, join->opaque)) /* * This should succeed always because shepherd should have sent * SPH_SRV_MSG_NEW_NODE only to the already joined node. 
*/ panic("sd_join_handler() failed"); memset(&snd, 0, sizeof(snd)); snd.type = SPH_CLI_MSG_ACCEPT; snd.body_len = rcv->body_len; ret = writev2(sph_comm_fd, &snd, join, rcv->body_len); if (sizeof(snd) + rcv->body_len != ret) { sd_err("writev() failed: %m"); exit(1); } free(join); } static void msg_new_node_finish(struct sph_msg *rcv) { int ret; struct sph_msg_join_node_finish *join_node_finish; join_node_finish = xzalloc(rcv->body_len); ret = xread(sph_comm_fd, join_node_finish, rcv->body_len); if (ret != rcv->body_len) { sd_err("xread() failed: %m"); exit(1); } memcpy(nodes, join_node_finish->nodes, join_node_finish->nr_nodes * sizeof(struct sd_node)); nr_nodes = join_node_finish->nr_nodes; sd_info("new node: %s", node_to_str(&join_node_finish->new_node)); /* FIXME: member change events must be ordered with nonblocked events */ sd_accept_handler(&join_node_finish->new_node, nodes, nr_nodes, join_node_finish->opaque); free(join_node_finish); } static void msg_notify_forward(struct sph_msg *rcv) { int ret; struct sph_msg_notify_forward *notify_forward; notify_forward = xzalloc(rcv->body_len); ret = xread(sph_comm_fd, notify_forward, rcv->body_len); if (ret != rcv->body_len) { sd_err("xread() failed: %m"); exit(1); } if (notify_forward->unblock) remove_one_block_event(); push_sph_event(true, ¬ify_forward->from_node, notify_forward->notify_msg, rcv->body_len - sizeof(*notify_forward)); free(notify_forward); } static void msg_block_forward(struct sph_msg *rcv) { int ret; struct sd_node sender; ret = xread(sph_comm_fd, &sender, sizeof(sender)); if (ret != sizeof(sender)) { sd_err("xread() failed: %m"); exit(1); } push_sph_event(false, &sender, NULL, 0); } static void do_leave_sheep(void) { int ret; struct sd_node sender; ret = xread(sph_comm_fd, &sender, sizeof(sender)); if (ret != sizeof(sender)) { sd_err("xread() failed: %m"); exit(1); } sd_info("removing node: %s", node_to_str(&sender)); if (xlremove(&sender, nodes, &nr_nodes, node_cmp)) goto removed; sd_info("leave 
message from unknown node: %s", node_to_str(&sender)); return; removed: sd_debug("calling sd_leave_handler(), sender: %s", node_to_str(&sender)); /* FIXME: member change events must be ordered with nonblocked events */ sd_leave_handler(&sender, nodes, nr_nodes); } static void msg_remove(struct sph_msg *rcv) { sd_info("sudden leaving of sheep is caused"); do_leave_sheep(); } static void msg_leave_forward(struct sph_msg *rcv) { sd_info("intuitive leaving of sheep is caused"); do_leave_sheep(); } static void (*msg_handlers[])(struct sph_msg *) = { [SPH_SRV_MSG_NEW_NODE] = msg_new_node, [SPH_SRV_MSG_NEW_NODE_FINISH] = msg_new_node_finish, [SPH_SRV_MSG_NOTIFY_FORWARD] = msg_notify_forward, [SPH_SRV_MSG_BLOCK_FORWARD] = msg_block_forward, [SPH_SRV_MSG_REMOVE] = msg_remove, [SPH_SRV_MSG_LEAVE_FORWARD] = msg_leave_forward, }; static void interpret_msg(struct sph_msg *rcv) { if (!(0 <= rcv->type && rcv->type < ARRAY_SIZE(msg_handlers))) { sd_err("invalid message from shepherd: %s", sph_srv_msg_to_str(rcv->type)); exit(1); } msg_handlers[rcv->type](rcv); } static void read_msg_from_shepherd(void) { struct sph_msg rcv; switch (state) { case STATE_PRE_JOIN: interpret_msg_pre_join(); break; case STATE_JOINED: read_msg(&rcv); interpret_msg(&rcv); break; default: panic("invalid state of shepherd cluster driver: %d", state); break; }; } static void shepherd_comm_handler(int fd, int events, void *data) { assert(fd == sph_comm_fd); assert(data == NULL); if (events & EPOLLIN) read_msg_from_shepherd(); else if (events & EPOLLHUP || events & EPOLLERR) { sd_err("connection to shepherd caused an error: %m"); exit(1); } } static int shepherd_init(const char *option) { int ret, port; char *copied, *s_addr, *s_port, *saveptr; if (!option) { sd_err("shepherd cluster driver requires at least IP" " address of shepherd as an option"); exit(1); } copied = strdup(option); if (!copied) { sd_err("strdup() failed: %m"); exit(1); } s_addr = strtok_r(copied, ":", &saveptr); if (!s_addr) { 
sd_err("strdup() failed: %m"); exit(1); } s_port = strtok_r(NULL, ":", &saveptr); if (s_port) { char *p; port = strtol(s_port, &p, 10); if (*p != '\0') { sd_err("invalid option for host and port: %s", option); exit(1); } } else port = SHEPHERD_PORT; sph_comm_fd = connect_to(s_addr, port); if (sph_comm_fd == -1) { sd_err("cannot connect to shepherd," " is shepherd running? errno: %m"); return -1; } sph_event_fd = eventfd(0, EFD_NONBLOCK); ret = register_event(sph_event_fd, sph_event_handler, NULL); if (ret) { sd_err("register_event() failed: %m"); exit(1); } free(copied); return 0; } static int shepherd_join(const struct sd_node *myself, void *opaque, size_t opaque_len) { int ret; static bool registered; /* keep opaque for retrying */ kept_opaque = xzalloc(opaque_len); memcpy(kept_opaque, opaque, opaque_len); kept_opaque_len = opaque_len; this_node = *myself; sd_debug("shepherd_join() called, myself is %s", node_to_str(myself)); ret = do_shepherd_join(); if (!registered) { register_event(sph_comm_fd, shepherd_comm_handler, NULL); registered = true; } return ret; } static int shepherd_leave(void) { int ret; struct sph_msg msg; msg.type = SPH_CLI_MSG_LEAVE; msg.body_len = 0; ret = xwrite(sph_comm_fd, &msg, sizeof(msg)); if (ret != sizeof(msg)) { sd_info("xwrite() failed: %m"); exit(1); } sd_debug("shepherd_leave() is completed"); return 0; } static int do_shepherd_notify(bool unblock, void *msg, size_t msg_len) { int ret; struct sph_msg snd; struct sph_msg_notify *notify; snd.type = SPH_CLI_MSG_NOTIFY; snd.body_len = msg_len + sizeof(*notify); notify = xzalloc(snd.body_len); notify->unblock = unblock; memcpy(notify->notify_msg, msg, msg_len); ret = writev2(sph_comm_fd, &snd, notify, snd.body_len); if (sizeof(snd) + snd.body_len != ret) { sd_err("writev() failed: %m"); exit(1); } free(notify); sd_info("do_shepherd_notify() is completed"); return 0; } static int shepherd_notify(void *msg, size_t msg_len) { return do_shepherd_notify(false, msg, msg_len) == 0 ? 
SD_RES_SUCCESS : SD_RES_CLUSTER_ERROR; } static int shepherd_block(void) { int ret; struct sph_msg msg; msg.type = SPH_CLI_MSG_BLOCK; msg.body_len = 0; ret = xwrite(sph_comm_fd, &msg, sizeof(msg)); if (ret != sizeof(msg)) { sd_err("xwrite() failed: %m"); exit(1); } return SD_RES_SUCCESS; } static int shepherd_unblock(void *msg, size_t msg_len) { return do_shepherd_notify(true, msg, msg_len) == 0 ? SD_RES_SUCCESS : SD_RES_CLUSTER_ERROR; } /* FIXME: shepherd server also has to udpate node information */ static int shepherd_update_node(struct sd_node *node) { return SD_RES_NO_SUPPORT; } static struct cluster_driver cdrv_shepherd = { .name = "shepherd", .init = shepherd_init, .join = shepherd_join, .leave = shepherd_leave, .notify = shepherd_notify, .block = shepherd_block, .unblock = shepherd_unblock, .update_node = shepherd_update_node, .get_local_addr = get_local_addr, }; cdrv_register(cdrv_shepherd); sheepdog-0.7.5/sheep/cluster/zookeeper.c000066400000000000000000000654431223630776600203050ustar00rootroot00000000000000/* * Copyright (C) 2011 Nippon Telegraph and Telephone Corporation. * * Copyright (C) 2012 Taobao Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include #include #include #include #include "cluster.h" #include "config.h" #include "event.h" #include "work.h" #include "util.h" #include "rbtree.h" #define SESSION_TIMEOUT 30000 /* millisecond */ #define BASE_ZNODE "/sheepdog" #define QUEUE_ZNODE BASE_ZNODE "/queue" #define MEMBER_ZNODE BASE_ZNODE "/member" #define MASTER_ZNONE BASE_ZNODE "/master" /* iterate child znodes */ #define FOR_EACH_ZNODE(parent, path, strs) \ for ((strs)->data += (strs)->count; \ (strs)->count-- ? 
\ snprintf(path, sizeof(path), "%s/%s", parent, \ *--(strs)->data) : (free((strs)->data), 0); \ free(*(strs)->data)) enum zk_event_type { EVENT_JOIN = 1, EVENT_ACCEPT, EVENT_LEAVE, EVENT_BLOCK, EVENT_UNBLOCK, EVENT_NOTIFY, EVENT_UPDATE_NODE, }; struct zk_node { struct list_head list; struct rb_node rb; struct sd_node node; bool callbacked; bool gone; }; struct zk_event { uint64_t id; enum zk_event_type type; struct zk_node sender; size_t msg_len; size_t nr_nodes; size_t buf_len; uint8_t buf[SD_MAX_EVENT_BUF_SIZE]; }; static struct sd_node sd_nodes[SD_MAX_NODES]; static size_t nr_sd_nodes; static struct rb_root zk_node_root = RB_ROOT; static struct sd_lock zk_tree_lock = SD_LOCK_INITIALIZER; static struct sd_lock zk_compete_master_lock = SD_LOCK_INITIALIZER; static LIST_HEAD(zk_block_list); static uatomic_bool is_master; static uatomic_bool stop; static bool joined; static bool first_push = true; static void zk_compete_master(void); static struct zk_node *zk_tree_insert(struct zk_node *new) { struct rb_node **p = &zk_node_root.rb_node; struct rb_node *parent = NULL; struct zk_node *entry; while (*p) { int cmp; parent = *p; entry = rb_entry(parent, struct zk_node, rb); cmp = node_cmp(&new->node, &entry->node); if (cmp < 0) p = &(*p)->rb_left; else if (cmp > 0) p = &(*p)->rb_right; else /* already has this entry */ return entry; } rb_link_node(&new->rb, parent, p); rb_insert_color(&new->rb, &zk_node_root); return NULL; /* insert successfully */ } static struct zk_node *zk_tree_search_nolock(const struct node_id *nid) { struct rb_node *n = zk_node_root.rb_node; struct zk_node *t; while (n) { int cmp; t = rb_entry(n, struct zk_node, rb); cmp = node_id_cmp(nid, &t->node.nid); if (cmp < 0) n = n->rb_left; else if (cmp > 0) n = n->rb_right; else return t; /* found it */ } return NULL; } static inline struct zk_node *zk_tree_search(const struct node_id *nid) { struct zk_node *n; sd_read_lock(&zk_tree_lock); n = zk_tree_search_nolock(nid); sd_unlock(&zk_tree_lock); return n; 
} /* zookeeper API wrapper */ static zhandle_t *zhandle; static struct zk_node this_node; #define CHECK_ZK_RC(rc, path) \ switch (rc) { \ case ZNONODE: \ case ZNODEEXISTS: \ break; \ case ZINVALIDSTATE: \ case ZSESSIONEXPIRED: \ case ZOPERATIONTIMEOUT: \ case ZCONNECTIONLOSS: \ sd_err("failed, path:%s, %s", path, zerror(rc)); \ case ZOK: \ break; \ case ZNOCHILDRENFOREPHEMERALS: \ /* \ * Because code has guaranteed that parent nodes are \ * always non-ephemeral, this could happen only when \ * sheep joins a cluster in an incompatible version. \ */ \ sd_err("incompatible version of sheep %s", \ PACKAGE_VERSION); \ default: \ panic("failed, path:%s, %s", path, zerror(rc)); \ } #define RETURN_IF_ERROR(stmt, fmt, ...) \ do { \ int __rc = stmt; \ if (__rc != ZOK) { \ sd_err("failed, " fmt ", %s", \ ##__VA_ARGS__, zerror(__rc)); \ return __rc; \ } \ } while (0) #define RETURN_VOID_IF_ERROR(stmt, fmt, ...) \ do { \ int __rc = stmt; \ if (__rc != ZOK) { \ sd_err("failed, " fmt ", %s", \ ##__VA_ARGS__, zerror(__rc)); \ return; \ } \ } while (0) static inline ZOOAPI int zk_delete_node(const char *path, int version) { int rc; do { rc = zoo_delete(zhandle, path, version); } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS); CHECK_ZK_RC(rc, path); return rc; } static inline ZOOAPI int zk_init_node(const char *path) { int rc; do { rc = zoo_create(zhandle, path, "", 0, &ZOO_OPEN_ACL_UNSAFE, 0, NULL, 0); } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS); CHECK_ZK_RC(rc, path); if (rc == ZNODEEXISTS) rc = ZOK; return rc; } static inline ZOOAPI int zk_create_node(const char *path, const char *value, int valuelen, const struct ACL_vector *acl, int flags, char *path_buffer, int path_buffer_len) { int rc; do { rc = zoo_create(zhandle, path, value, valuelen, acl, flags, path_buffer, path_buffer_len); } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS); CHECK_ZK_RC(rc, path); return rc; } /* * Create a znode after adding a unique monotonically increasing sequence 
number * to the path name. * * Note that the caller has to retry this function when this returns * ZOPERATIONTIMEOUT or ZCONNECTIONLOSS and the znode is not created. */ static inline ZOOAPI int zk_create_seq_node(const char *path, const char *value, int valuelen, char *path_buffer, int path_buffer_len, bool ephemeral) { int rc; int flags = ZOO_SEQUENCE; if (ephemeral) flags = flags | ZOO_EPHEMERAL; rc = zoo_create(zhandle, path, value, valuelen, &ZOO_OPEN_ACL_UNSAFE, flags, path_buffer, path_buffer_len); CHECK_ZK_RC(rc, path); return rc; } static inline ZOOAPI int zk_get_data(const char *path, void *buffer, int *buffer_len) { int rc; do { rc = zoo_get(zhandle, path, 1, (char *)buffer, buffer_len, NULL); } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS); CHECK_ZK_RC(rc, path); return rc; } static inline ZOOAPI int zk_set_data(const char *path, const char *buffer, int buflen, int version) { int rc; do { rc = zoo_set(zhandle, path, buffer, buflen, version); } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS); CHECK_ZK_RC(rc, path); return rc; } static inline ZOOAPI int zk_node_exists(const char *path) { int rc; do { rc = zoo_exists(zhandle, path, 1, NULL); } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS); CHECK_ZK_RC(rc, path); return rc; } static inline ZOOAPI int zk_get_children(const char *path, struct String_vector *strings) { int rc; do { rc = zoo_get_children(zhandle, path, 1, strings); } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS); CHECK_ZK_RC(rc, path); return rc; } /* ZooKeeper-based queue give us an totally ordered events */ static int efd; static int32_t queue_pos; static int zk_queue_peek(bool *peek) { int rc; char path[MAX_NODE_STR_LEN]; snprintf(path, sizeof(path), QUEUE_ZNODE "/%010"PRId32, queue_pos); rc = zk_node_exists(path); switch (rc) { case ZOK: *peek = true; return ZOK; case ZNONODE: *peek = false; return ZOK; default: sd_err("failed, %s", zerror(rc)); return rc; } } /* return true if there is a node with 
'id' in the queue. */ static int zk_find_seq_node(uint64_t id, char *seq_path, int seq_path_len, bool *found) { int rc, len; for (int seq = queue_pos; ; seq++) { struct zk_event ev; snprintf(seq_path, seq_path_len, QUEUE_ZNODE"/%010"PRId32, seq); len = offsetof(typeof(ev), id) + sizeof(ev.id); rc = zk_get_data(seq_path, &ev, &len); switch (rc) { case ZOK: if (ev.id == id) { sd_debug("id %" PRIx64 " is found in %s", id, seq_path); *found = true; return ZOK; } break; case ZNONODE: sd_debug("id %"PRIx64" is not found", id); *found = false; return ZOK; default: sd_err("failed, %s", zerror(rc)); return rc; } } } static int zk_queue_push(struct zk_event *ev) { int rc, len; char path[MAX_NODE_STR_LEN], buf[MAX_NODE_STR_LEN]; bool found; len = offsetof(typeof(*ev), buf) + ev->buf_len; snprintf(path, sizeof(path), "%s/", QUEUE_ZNODE); again: rc = zk_create_seq_node(path, (char *)ev, len, buf, sizeof(buf), false); switch (rc) { case ZOK: /* Success */ break; case ZOPERATIONTIMEOUT: case ZCONNECTIONLOSS: if (zk_find_seq_node(ev->id, buf, sizeof(buf), &found) == ZOK) { if (found) break; else /* retry if seq_node was not created */ goto again; } /* fall through */ default: sd_err("failed, path:%s, %s", path, zerror(rc)); return rc; } if (first_push) { int32_t seq; sscanf(buf, QUEUE_ZNODE "/%"PRId32, &seq); queue_pos = seq; eventfd_xwrite(efd, 1); first_push = false; } sd_debug("create path:%s, queue_pos:%010" PRId32 ", len:%d", buf, queue_pos, len); return ZOK; } static inline void *zk_event_sd_nodes(struct zk_event *ev) { return (char *)ev->buf + ev->msg_len; } /* Change the join event in place and piggyback the nodes information. 
*/ static int push_join_response(struct zk_event *ev) { char path[MAX_NODE_STR_LEN]; int len; ev->type = EVENT_ACCEPT; ev->nr_nodes = nr_sd_nodes; memcpy(zk_event_sd_nodes(ev), sd_nodes, nr_sd_nodes * sizeof(struct sd_node)); queue_pos--; len = offsetof(typeof(*ev), buf) + ev->buf_len; snprintf(path, sizeof(path), QUEUE_ZNODE "/%010"PRId32, queue_pos); RETURN_IF_ERROR(zk_set_data(path, (char *)ev, len, -1), ""); sd_debug("update path:%s, queue_pos:%010" PRId32 ", len:%d", path, queue_pos, len); return ZOK; } static int zk_queue_pop_advance(struct zk_event *ev) { int len; char path[MAX_NODE_STR_LEN]; len = sizeof(*ev); snprintf(path, sizeof(path), QUEUE_ZNODE "/%010"PRId32, queue_pos); RETURN_IF_ERROR(zk_get_data(path, ev, &len), "path %s", path); sd_debug("%s, type:%d, len:%d, pos:%" PRId32, path, ev->type, len, queue_pos); queue_pos++; return ZOK; } static inline void zk_tree_add(struct zk_node *node) { struct zk_node *zk = xzalloc(sizeof(*zk)); *zk = *node; sd_write_lock(&zk_tree_lock); if (zk_tree_insert(zk)) { free(zk); goto out; } /* * Even node list will be built later, we need this because in master * transfer case, we need this information to destroy the tree. 
*/ sd_nodes[nr_sd_nodes++] = zk->node; out: sd_unlock(&zk_tree_lock); } static inline void zk_tree_del_nolock(struct zk_node *node) { rb_erase(&node->rb, &zk_node_root); free(node); } static inline void zk_tree_del(struct zk_node *node) { sd_write_lock(&zk_tree_lock); zk_tree_del_nolock(node); sd_unlock(&zk_tree_lock); } static inline void zk_tree_destroy(void) { struct zk_node *zk; int i; sd_write_lock(&zk_tree_lock); for (i = 0; i < nr_sd_nodes; i++) { zk = zk_tree_search_nolock(&sd_nodes[i].nid); if (zk) zk_tree_del_nolock(zk); } sd_unlock(&zk_tree_lock); } static inline void build_node_list(void) { struct rb_node *n; struct zk_node *zk; nr_sd_nodes = 0; for (n = rb_first(&zk_node_root); n; n = rb_next(n)) { zk = rb_entry(n, struct zk_node, rb); sd_nodes[nr_sd_nodes++] = zk->node; } sd_debug("nr_sd_nodes:%zu", nr_sd_nodes); } static int zk_queue_init(void) { RETURN_IF_ERROR(zk_init_node(BASE_ZNODE), "path %s", BASE_ZNODE); RETURN_IF_ERROR(zk_init_node(MASTER_ZNONE), "path %s", MASTER_ZNONE); RETURN_IF_ERROR(zk_init_node(QUEUE_ZNODE), "path %s", QUEUE_ZNODE); RETURN_IF_ERROR(zk_init_node(MEMBER_ZNODE), "path %s", MEMBER_ZNODE); return ZOK; } /* Calculate a unique 64 bit integer from this_node and the sequence number. 
*/ static uint64_t get_uniq_id(void) { static int seq; uint64_t id, n = uatomic_add_return(&seq, 1); id = fnv_64a_buf(&this_node, sizeof(this_node), FNV1A_64_INIT); id = fnv_64a_buf(&n, sizeof(n), id); return id; } static int add_event(enum zk_event_type type, struct zk_node *znode, void *buf, size_t buf_len) { struct zk_event ev; int rc; ev.id = get_uniq_id(); ev.type = type; ev.sender = *znode; ev.buf_len = buf_len; if (buf) memcpy(ev.buf, buf, buf_len); rc = zk_queue_push(&ev); if (rc == ZOK) return SD_RES_SUCCESS; else { sd_err("failed, type: %d, %s", type, zerror(rc)); return SD_RES_CLUSTER_ERROR; } } static void zk_watcher(zhandle_t *zh, int type, int state, const char *path, void *ctx) { struct zk_node znode; char str[MAX_NODE_STR_LEN], *p; int ret; if (type == ZOO_SESSION_EVENT && state == ZOO_EXPIRED_SESSION_STATE) { /* * do reconnect in main thread to avoid on-the-fly zookeeper * operations. */ eventfd_xwrite(efd, 1); return; } /* CREATED_EVENT 1, DELETED_EVENT 2, CHANGED_EVENT 3, CHILD_EVENT 4 */ sd_debug("path:%s, type:%d", path, type); if (type == ZOO_CREATED_EVENT || type == ZOO_CHANGED_EVENT) { ret = sscanf(path, MEMBER_ZNODE "/%s", str); if (ret == 1) zk_node_exists(path); /* kick off the event handler */ eventfd_xwrite(efd, 1); } else if (type == ZOO_DELETED_EVENT) { struct zk_node *n; ret = sscanf(path, MASTER_ZNONE "/%s", str); if (ret == 1) { zk_compete_master(); return; } ret = sscanf(path, MEMBER_ZNODE "/%s", str); if (ret != 1) return; p = strrchr(path, '/'); p++; str_to_node(p, &znode.node); /* FIXME: remove redundant leave events */ sd_read_lock(&zk_tree_lock); n = zk_tree_search_nolock(&znode.node.nid); if (n) n->gone = true; sd_unlock(&zk_tree_lock); if (n) add_event(EVENT_LEAVE, &znode, NULL, 0); } } /* * We placehold the enough space to piggyback the nodes information on join * response message so that every node can see the same membership view. 
*/ static int add_join_event(void *msg, size_t msg_len) { struct zk_event ev; size_t len = msg_len + sizeof(struct sd_node) * SD_MAX_NODES; assert(len <= SD_MAX_EVENT_BUF_SIZE); ev.id = get_uniq_id(); ev.type = EVENT_JOIN; ev.sender = this_node; ev.msg_len = msg_len; ev.buf_len = len; if (msg) memcpy(ev.buf, msg, msg_len); return zk_queue_push(&ev); } static int zk_get_least_seq(const char *parent, char *least_seq_path, int path_len, void *buf, int *buf_len) { char path[MAX_NODE_STR_LEN], *p, *tmp; struct String_vector strs; int rc, least_seq = INT_MAX , seq; while (true) { RETURN_IF_ERROR(zk_get_children(parent, &strs), ""); FOR_EACH_ZNODE(parent, path, &strs) { p = strrchr(path, '/'); seq = strtol(++p, &tmp, 10); if (seq < least_seq) least_seq = seq; } snprintf(path, MAX_NODE_STR_LEN, "%s/%010"PRId32, parent, least_seq); rc = zk_get_data(path, buf, buf_len); switch (rc) { case ZOK: strncpy(least_seq_path, path, path_len); return ZOK; case ZNONODE: break; default: sd_err("failed, %s", zerror(rc)); return rc; } } } static int zk_find_master(int *master_seq, char *master_name) { int rc, len = MAX_NODE_STR_LEN; char master_compete_path[MAX_NODE_STR_LEN]; if (*master_seq < 0) { RETURN_IF_ERROR(zk_get_least_seq(MASTER_ZNONE, master_compete_path, MAX_NODE_STR_LEN, master_name, &len), ""); sscanf(master_compete_path, MASTER_ZNONE "/%"PRId32, master_seq); return ZOK; } else { while (true) { snprintf(master_compete_path, len, MASTER_ZNONE "/%010"PRId32, *master_seq); rc = zk_get_data(master_compete_path, master_name, &len); switch (rc) { case ZOK: return ZOK; case ZNONODE: sd_info("detect master leave, " "start to compete master"); (*master_seq)++; break; default: sd_err("failed, %s", zerror(rc)); return rc; } } } } /* * block until last sheep joined * last_sheep returns sequence number of last sheep or -1 if no previous sheep */ static int zk_verify_last_sheep_join(int seq, int *last_sheep) { int rc, len = MAX_NODE_STR_LEN; char path[MAX_NODE_STR_LEN], 
name[MAX_NODE_STR_LEN]; for (*last_sheep = seq - 1; *last_sheep >= 0; (*last_sheep)--) { snprintf(path, MAX_NODE_STR_LEN, MASTER_ZNONE "/%010"PRId32, *last_sheep); rc = zk_get_data(path, name, &len); switch (rc) { case ZNONODE: continue; case ZOK: break; default: sd_err("failed, %s", zerror(rc)); return rc; } if (!strcmp(name, node_to_str(&this_node.node))) continue; snprintf(path, MAX_NODE_STR_LEN, MEMBER_ZNODE "/%s", name); rc = zk_node_exists(path); switch (rc) { case ZOK: return ZOK; case ZNONODE: (*last_sheep)++; break; default: sd_err("failed, %s", zerror(rc)); return rc; } } return ZOK; } /* * Create sequential node under MASTER_ZNODE. * Sheep with least sequential number win the competition. */ static void zk_compete_master(void) { int rc, last_joined_sheep; char master_name[MAX_NODE_STR_LEN]; char my_compete_path[MAX_NODE_STR_LEN]; static int master_seq = -1, my_seq; /* * This is to protect master_seq and my_seq because this function will * be called by both main thread and zookeeper's event thread. 
*/ sd_write_lock(&zk_compete_master_lock); if (uatomic_is_true(&is_master) || uatomic_is_true(&stop)) goto out_unlock; if (!joined) { sd_debug("start to compete master for the first time"); do { if (uatomic_is_true(&stop)) goto out_unlock; /* duplicate sequential node has no side-effect */ rc = zk_create_seq_node(MASTER_ZNONE "/", node_to_str(&this_node.node), MAX_NODE_STR_LEN, my_compete_path, MAX_NODE_STR_LEN, true); } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS); CHECK_ZK_RC(rc, MASTER_ZNONE "/"); if (rc != ZOK) goto out_unlock; sd_debug("my compete path: %s", my_compete_path); sscanf(my_compete_path, MASTER_ZNONE "/%"PRId32, &my_seq); } if (zk_find_master(&master_seq, master_name) != ZOK) goto out_unlock; if (!strcmp(master_name, node_to_str(&this_node.node))) goto success; else if (joined) { sd_debug("lost"); goto out_unlock; } else { if (zk_verify_last_sheep_join(my_seq, &last_joined_sheep) != ZOK) goto out_unlock; if (last_joined_sheep < 0) { /* all previous sheep has quit, i'm master */ master_seq = my_seq; goto success; } else { sd_debug("lost"); goto out_unlock; } } success: uatomic_set_true(&is_master); sd_debug("success"); out_unlock: sd_unlock(&zk_compete_master_lock); } static int zk_join(const struct sd_node *myself, void *opaque, size_t opaque_len) { int rc; char path[MAX_NODE_STR_LEN]; this_node.node = *myself; snprintf(path, sizeof(path), MEMBER_ZNODE "/%s", node_to_str(myself)); rc = zk_node_exists(path); if (rc == ZOK) { sd_err("Previous zookeeper session exist, shoot myself."); exit(1); } zk_compete_master(); RETURN_IF_ERROR(add_join_event(opaque, opaque_len), ""); return ZOK; } static int zk_leave(void) { char path[PATH_MAX]; sd_info("leaving from cluster"); uatomic_set_true(&stop); snprintf(path, sizeof(path), MEMBER_ZNODE"/%s", node_to_str(&this_node.node)); add_event(EVENT_LEAVE, &this_node, NULL, 0); zk_delete_node(path, -1); return 0; } static int zk_notify(void *msg, size_t msg_len) { return add_event(EVENT_NOTIFY, 
&this_node, msg, msg_len); } static int zk_block(void) { return add_event(EVENT_BLOCK, &this_node, NULL, 0); } static int zk_unblock(void *msg, size_t msg_len) { return add_event(EVENT_UNBLOCK, &this_node, msg, msg_len); } static void zk_handle_join(struct zk_event *ev) { sd_debug("sender: %s", node_to_str(&ev->sender.node)); if (!uatomic_is_true(&is_master)) { /* Let's await master acking the join-request */ queue_pos--; return; } sd_join_handler(&ev->sender.node, sd_nodes, nr_sd_nodes, ev->buf); push_join_response(ev); sd_debug("I'm the master now"); } static void watch_all_nodes(void) { struct String_vector strs; char path[MAX_NODE_STR_LEN]; RETURN_VOID_IF_ERROR(zk_get_children(MEMBER_ZNODE, &strs), ""); FOR_EACH_ZNODE(MEMBER_ZNODE, path, &strs) { RETURN_VOID_IF_ERROR(zk_node_exists(path), ""); } } static void init_node_list(struct zk_event *ev) { uint8_t *p = zk_event_sd_nodes(ev); size_t node_nr = ev->nr_nodes; int i; sd_debug("%zu", node_nr); for (i = 0; i < node_nr; i++) { struct zk_node zk; mempcpy(&zk.node, p, sizeof(struct sd_node)); zk_tree_add(&zk); p += sizeof(struct sd_node); } watch_all_nodes(); } static void zk_handle_accept(struct zk_event *ev) { char path[MAX_NODE_STR_LEN]; int rc; sd_debug("ACCEPT"); if (node_eq(&ev->sender.node, &this_node.node)) /* newly joined node */ init_node_list(ev); sd_debug("%s", node_to_str(&ev->sender.node)); snprintf(path, sizeof(path), MEMBER_ZNODE"/%s", node_to_str(&ev->sender.node)); if (node_eq(&ev->sender.node, &this_node.node)) { joined = true; sd_debug("create path:%s", path); rc = zk_create_node(path, (char *)zoo_client_id(zhandle), sizeof(clientid_t), &ZOO_OPEN_ACL_UNSAFE, ZOO_EPHEMERAL, NULL, 0); RETURN_VOID_IF_ERROR(rc, ""); } else zk_node_exists(path); zk_tree_add(&ev->sender); build_node_list(); sd_accept_handler(&ev->sender.node, sd_nodes, nr_sd_nodes, ev->buf); } static void kick_block_event(void) { struct zk_node *block; if (list_empty(&zk_block_list)) return; block = list_first_entry(&zk_block_list, 
typeof(*block), list); if (!block->callbacked) block->callbacked = sd_block_handler(&block->node); } static void block_event_list_del(struct zk_node *n) { struct zk_node *ev, *t; list_for_each_entry_safe(ev, t, &zk_block_list, list) { if (node_eq(&ev->node, &n->node)) { list_del(&ev->list); free(ev); } } } static void zk_handle_leave(struct zk_event *ev) { struct zk_node *n = zk_tree_search(&ev->sender.node.nid); if (!n) { sd_debug("can't find this leave node:%s, ignore it.", node_to_str(&ev->sender.node)); return; } block_event_list_del(n); zk_tree_del(n); build_node_list(); sd_leave_handler(&ev->sender.node, sd_nodes, nr_sd_nodes); } static void zk_handle_block(struct zk_event *ev) { struct zk_node *block = xzalloc(sizeof(*block)); sd_debug("BLOCK"); block->node = ev->sender.node; list_add_tail(&block->list, &zk_block_list); block = list_first_entry(&zk_block_list, typeof(*block), list); if (!block->callbacked) block->callbacked = sd_block_handler(&block->node); } static void zk_handle_unblock(struct zk_event *ev) { struct zk_node *block; sd_debug("UNBLOCK"); if (list_empty(&zk_block_list)) return; block = list_first_entry(&zk_block_list, typeof(*block), list); sd_notify_handler(&ev->sender.node, ev->buf, ev->buf_len); list_del(&block->list); free(block); } static void zk_handle_notify(struct zk_event *ev) { sd_debug("NOTIFY"); sd_notify_handler(&ev->sender.node, ev->buf, ev->buf_len); } static void zk_handle_update_node(struct zk_event *ev) { struct zk_node *t; struct sd_node *snode = &ev->sender.node; sd_debug("%s", node_to_str(snode)); if (node_eq(snode, &this_node.node)) this_node.node = *snode; sd_read_lock(&zk_tree_lock); t = zk_tree_search_nolock(&snode->nid); assert(t); t->node = *snode; build_node_list(); sd_unlock(&zk_tree_lock); sd_update_node_handler(snode); } static void (*const zk_event_handlers[])(struct zk_event *ev) = { [EVENT_JOIN] = zk_handle_join, [EVENT_ACCEPT] = zk_handle_accept, [EVENT_LEAVE] = zk_handle_leave, [EVENT_BLOCK] = 
zk_handle_block, [EVENT_UNBLOCK] = zk_handle_unblock, [EVENT_NOTIFY] = zk_handle_notify, [EVENT_UPDATE_NODE] = zk_handle_update_node, }; static const int zk_max_event_handlers = ARRAY_SIZE(zk_event_handlers); /* * This method should be done in main thread and triggered when zk_watcher() * receives a session timeout event. * All other zk operations who receive 'ZINVALIDSTATE' return code should drop * control of main thread as soon as possible. So that this method can be * executed and re-establish a new session with zookeeper server. */ static inline void handle_session_expire(void) { /* clean memory states */ close(efd); zk_tree_destroy(); INIT_RB_ROOT(&zk_node_root); INIT_LIST_HEAD(&zk_block_list); nr_sd_nodes = 0; first_push = true; joined = false; memset(sd_nodes, 0, sizeof(struct sd_node) * SD_MAX_NODES); while (sd_reconnect_handler()) { sd_err("failed to reconnect. sleep and retry..."); sleep(1); } } static void zk_event_handler(int listen_fd, int events, void *data) { struct zk_event ev; bool peek; sd_debug("%d, %d", events, queue_pos); if (events & EPOLLHUP) { sd_err("zookeeper driver received EPOLLHUP event, exiting."); log_close(); exit(1); } eventfd_xread(efd); if (zoo_state(zhandle) == ZOO_EXPIRED_SESSION_STATE) { sd_err("detect a session timeout. reconnecting..."); handle_session_expire(); sd_info("reconnected"); eventfd_xwrite(efd, 1); return; } RETURN_VOID_IF_ERROR(zk_queue_peek(&peek), ""); if (!peek) goto kick_block_event; RETURN_VOID_IF_ERROR(zk_queue_pop_advance(&ev), ""); if (ev.type < zk_max_event_handlers && zk_event_handlers[ev.type]) zk_event_handlers[ev.type](&ev); else panic("unhandled type %d", ev.type); RETURN_VOID_IF_ERROR(zk_queue_peek(&peek), ""); if (peek) { /* Someone has created next event, go kick event handler. */ eventfd_xwrite(efd, 1); return; } kick_block_event: /* * Kick block event only if there is no nonblock event. We perfer to * handle nonblock event becasue: * * 1. 
Sheep assuems that unblock() and notify() is a transaction, so we * can only kick next block event after sd_notify_handler() is called * 2. We should process leave/join event as soon as possible. */ kick_block_event(); } static int zk_init(const char *option) { char *hosts, *to, *p; int ret, timeout = SESSION_TIMEOUT; if (!option) { sd_err("You must specify zookeeper servers."); return -1; } hosts = strtok((char *)option, "="); if ((to = strtok(NULL, "="))) { if (sscanf(to, "%u", &timeout) != 1) { sd_err("Invalid paramter for timeout"); return -1; } p = strstr(hosts, "timeout"); *--p = '\0'; } sd_debug("version %d.%d.%d, address %s, timeout %d", ZOO_MAJOR_VERSION, ZOO_MINOR_VERSION, ZOO_PATCH_VERSION, hosts, timeout); zhandle = zookeeper_init(hosts, zk_watcher, timeout, NULL, NULL, 0); if (!zhandle) { sd_err("failed to connect to zk server %s", option); return -1; } uatomic_set_false(&stop); uatomic_set_false(&is_master); if (zk_queue_init() != ZOK) return -1; efd = eventfd(0, EFD_NONBLOCK); if (efd < 0) { sd_err("failed to create an event fd: %m"); return -1; } ret = register_event(efd, zk_event_handler, NULL); if (ret) { sd_err("failed to register zookeeper event handler (%d)", ret); return -1; } return 0; } static int zk_update_node(struct sd_node *node) { struct zk_node znode = { .node = *node, }; return add_event(EVENT_UPDATE_NODE, &znode, NULL, 0); } static struct cluster_driver cdrv_zookeeper = { .name = "zookeeper", .init = zk_init, .join = zk_join, .leave = zk_leave, .notify = zk_notify, .block = zk_block, .unblock = zk_unblock, .update_node = zk_update_node, .get_local_addr = get_local_addr, }; cdrv_register(cdrv_zookeeper); sheepdog-0.7.5/sheep/config.c000066400000000000000000000071061223630776600160560ustar00rootroot00000000000000/* * Copyright (C) 2012 Nippon Telegraph and Telephone Corporation. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "sheep_priv.h" #define SD_FORMAT_VERSION 0x0002 #define SD_CONFIG_SIZE 40 static struct sheepdog_config { uint64_t ctime; uint16_t flags; uint8_t copies; uint8_t store[STORE_LEN]; uint8_t __pad[3]; uint16_t version; uint64_t space; } config; char *config_path; #define CONFIG_PATH "/config" static int write_config(void) { int ret; ret = atomic_create_and_write(config_path, (char *)&config, sizeof(config), true); if (ret < 0) { sd_err("atomic_create_and_write() failed"); return SD_RES_EIO; } return SD_RES_SUCCESS; } static void check_tmp_config(void) { int ret; char tmp_config_path[PATH_MAX]; snprintf(tmp_config_path, PATH_MAX, "%s.tmp", config_path); ret = unlink(tmp_config_path); if (!ret || ret != ENOENT) return; sd_info("removed temporal config file"); } int init_config_file(void) { int fd, ret; check_tmp_config(); fd = open(config_path, O_RDONLY); if (fd < 0) { if (errno != ENOENT) { sd_err("failed to read config file, %m"); return -1; } goto create; } ret = xread(fd, &config, sizeof(config)); if (ret == 0) { close(fd); goto create; } if (ret < 0) { sd_err("failed to read config file, %m"); goto out; } if (config.version != SD_FORMAT_VERSION) { sd_err("This sheep version is not compatible with" " the existing data layout, %d", config.version); if (sys->upgrade) { /* upgrade sheep store */ ret = sd_migrate_store(config.version, SD_FORMAT_VERSION); if (ret == 0) { /* reload config file */ ret = xpread(fd, &config, sizeof(config), 0); if (ret != sizeof(config)) { sd_err("failed to reload config file," " %m"); ret = -1; } else ret = 0; } goto out; } sd_err("use '-u' option to upgrade sheep store"); ret = -1; goto out; } ret = 0; get_cluster_config(&sys->cinfo); 
out: close(fd); return ret; create: config.version = SD_FORMAT_VERSION; if (write_config() != SD_RES_SUCCESS) return -1; return 0; } void init_config_path(const char *base_path) { int len = strlen(base_path) + strlen(CONFIG_PATH) + 1; config_path = xzalloc(len); snprintf(config_path, len, "%s" CONFIG_PATH, base_path); } int set_cluster_config(const struct cluster_info *cinfo) { config.ctime = cinfo->ctime; config.copies = cinfo->nr_copies; config.flags = cinfo->flags; memset(config.store, 0, sizeof(config.store)); pstrcpy((char *)config.store, sizeof(config.store), (char *)cinfo->store); return write_config(); } int get_cluster_config(struct cluster_info *cinfo) { cinfo->ctime = config.ctime; cinfo->nr_copies = config.copies; cinfo->flags = config.flags; memcpy(cinfo->store, config.store, sizeof(config.store)); return SD_RES_SUCCESS; } int set_node_space(uint64_t space) { config.space = space; return write_config(); } int get_node_space(uint64_t *space) { *space = config.space; return SD_RES_SUCCESS; } bool is_cluster_formatted(void) { struct cluster_info cinfo; get_cluster_config(&cinfo); return cinfo.ctime != 0; } static inline __attribute__((used)) void __sd_config_format_build_bug_ons(void) { /* never called, only for checking BUILD_BUG_ON()s */ BUILD_BUG_ON(sizeof(struct sheepdog_config) != SD_CONFIG_SIZE); } sheepdog-0.7.5/sheep/gateway.c000066400000000000000000000205441223630776600162530ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * Copyright (C) 2012-2013 Taobao Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "sheep_priv.h" static inline void gateway_init_fwd_hdr(struct sd_req *fwd, struct sd_req *hdr) { memcpy(fwd, hdr, sizeof(*fwd)); fwd->opcode = gateway_to_peer_opcode(hdr->opcode); fwd->proto_ver = SD_SHEEP_PROTO_VER; } /* * Try our best to read one copy and read local first. * * Return success if any read succeed. We don't call gateway_forward_request() * because we only read once. */ int gateway_read_obj(struct request *req) { int i, ret = SD_RES_SUCCESS; struct sd_req fwd_hdr; struct sd_rsp *rsp = (struct sd_rsp *)&fwd_hdr; const struct sd_vnode *v; const struct sd_vnode *obj_vnodes[SD_MAX_COPIES]; uint64_t oid = req->rq.obj.oid; int nr_copies, j; if (sys->enable_object_cache && !req->local && !bypass_object_cache(req)) { ret = object_cache_handle_request(req); goto out; } nr_copies = get_req_copy_number(req); if (nr_copies == 0) { sd_debug("there is no living nodes"); return SD_RES_HALT; } oid_to_vnodes(req->vinfo->vnodes, req->vinfo->nr_vnodes, oid, nr_copies, obj_vnodes); for (i = 0; i < nr_copies; i++) { v = obj_vnodes[i]; if (!vnode_is_local(v)) continue; ret = peer_read_obj(req); if (ret == SD_RES_SUCCESS) goto out; sd_err("local read %"PRIx64" failed, %s", oid, sd_strerror(ret)); break; } /* * Read random copy from cluster for better load balance, useful for * reading base VM's COW objects */ j = random(); for (i = 0; i < nr_copies; i++) { int idx = (i + j) % nr_copies; v = obj_vnodes[idx]; if (vnode_is_local(v)) continue; /* * We need to re-init it because rsp and req share the same * structure. 
*/ gateway_init_fwd_hdr(&fwd_hdr, &req->rq); ret = sheep_exec_req(&v->nid, &fwd_hdr, req->data); if (ret != SD_RES_SUCCESS) continue; /* Read success */ memcpy(&req->rp, rsp, sizeof(*rsp)); break; } out: if (ret == SD_RES_SUCCESS && req->rq.proto_ver < SD_PROTO_VER_TRIM_ZERO_SECTORS) { /* the client doesn't support trimming zero bytes */ untrim_zero_blocks(req->data, req->rp.obj.offset, req->rp.data_length, req->rq.data_length); req->rp.data_length = req->rq.data_length; req->rp.obj.offset = 0; } return ret; } struct write_info_entry { struct pollfd pfd; const struct node_id *nid; struct sockfd *sfd; }; struct write_info { struct write_info_entry ent[SD_MAX_NODES]; int nr_sent; }; static inline void write_info_update(struct write_info *wi, int pos) { sd_debug("%d, %d", wi->nr_sent, pos); wi->nr_sent--; memmove(wi->ent + pos, wi->ent + pos + 1, sizeof(struct write_info_entry) * (wi->nr_sent - pos)); } static inline void finish_one_write(struct write_info *wi, int i) { sockfd_cache_put(wi->ent[i].nid, wi->ent[i].sfd); write_info_update(wi, i); } static inline void finish_one_write_err(struct write_info *wi, int i) { sockfd_cache_del(wi->ent[i].nid, wi->ent[i].sfd); write_info_update(wi, i); } struct pfd_info { struct pollfd pfds[SD_MAX_NODES]; int nr; }; static inline void pfd_info_init(struct write_info *wi, struct pfd_info *pi) { int i; for (i = 0; i < wi->nr_sent; i++) pi->pfds[i] = wi->ent[i].pfd; pi->nr = wi->nr_sent; } /* * Wait for all forward requests completion. * * Even if something goes wrong, we have to wait forward requests completion to * avoid interleaved requests. * * Return error code if any one request fails. 
*/ static int wait_forward_request(struct write_info *wi, struct request *req) { int nr_sent, err_ret = SD_RES_SUCCESS, ret, pollret, i, repeat = MAX_RETRY_COUNT; struct pfd_info pi; struct sd_rsp *rsp = &req->rp; again: pfd_info_init(wi, &pi); pollret = poll(pi.pfds, pi.nr, 1000 * POLL_TIMEOUT); if (pollret < 0) { if (errno == EINTR) goto again; panic("%m"); } else if (pollret == 0) { /* * If IO NIC is down, epoch isn't incremented, so we can't retry * for ever. */ if (sheep_need_retry(req->rq.epoch) && repeat) { repeat--; sd_warn("poll timeout %d, disks of some nodes or " "network is busy. Going to poll-wait again", wi->nr_sent); goto again; } nr_sent = wi->nr_sent; /* XXX Blinedly close all the connections */ for (i = 0; i < nr_sent; i++) sockfd_cache_del(wi->ent[i].nid, wi->ent[i].sfd); return SD_RES_NETWORK_ERROR; } nr_sent = wi->nr_sent; for (i = 0; i < nr_sent; i++) if (pi.pfds[i].revents & POLLIN) break; if (i < nr_sent) { int re = pi.pfds[i].revents; sd_debug("%d, revents %x", i, re); if (re & (POLLERR | POLLHUP | POLLNVAL)) { err_ret = SD_RES_NETWORK_ERROR; finish_one_write_err(wi, i); goto finish_write; } if (do_read(pi.pfds[i].fd, rsp, sizeof(*rsp), sheep_need_retry, req->rq.epoch, MAX_RETRY_COUNT)) { sd_err("remote node might have gone away"); err_ret = SD_RES_NETWORK_ERROR; finish_one_write_err(wi, i); goto finish_write; } ret = rsp->result; if (ret != SD_RES_SUCCESS) { sd_err("fail %"PRIx64", %s", req->rq.obj.oid, sd_strerror(ret)); err_ret = ret; } finish_one_write(wi, i); } finish_write: if (wi->nr_sent > 0) goto again; return err_ret; } static inline void write_info_init(struct write_info *wi, size_t nr_to_send) { int i; for (i = 0; i < nr_to_send; i++) wi->ent[i].pfd.fd = -1; wi->nr_sent = 0; } static inline void write_info_advance(struct write_info *wi, const struct node_id *nid, struct sockfd *sfd) { wi->ent[wi->nr_sent].nid = nid; wi->ent[wi->nr_sent].pfd.fd = sfd->fd; wi->ent[wi->nr_sent].pfd.events = POLLIN; wi->ent[wi->nr_sent].sfd = sfd; 
wi->nr_sent++; } static int init_target_nodes(struct request *req, uint64_t oid, const struct sd_node **target_nodes) { int nr_to_send; const struct vnode_info *vinfo = req->vinfo; nr_to_send = get_req_copy_number(req); oid_to_nodes(vinfo->vnodes, vinfo->nr_vnodes, oid, nr_to_send, vinfo->nodes, target_nodes); return nr_to_send; } static int gateway_forward_request(struct request *req) { int i, err_ret = SD_RES_SUCCESS, ret, local = -1; unsigned wlen; uint64_t oid = req->rq.obj.oid; int nr_to_send; struct write_info wi; const struct sd_op_template *op; struct sd_req hdr; const struct sd_node *target_nodes[SD_MAX_NODES]; sd_debug("%"PRIx64, oid); gateway_init_fwd_hdr(&hdr, &req->rq); op = get_sd_op(hdr.opcode); wlen = hdr.data_length; nr_to_send = init_target_nodes(req, oid, target_nodes); write_info_init(&wi, nr_to_send); if (nr_to_send == 0) { sd_debug("there is no living nodes"); return SD_RES_HALT; } for (i = 0; i < nr_to_send; i++) { struct sockfd *sfd; const struct node_id *nid; if (node_is_local(target_nodes[i])) { local = i; continue; } nid = &target_nodes[i]->nid; sfd = sockfd_cache_get(nid); if (!sfd) { err_ret = SD_RES_NETWORK_ERROR; break; } ret = send_req(sfd->fd, &hdr, req->data, wlen, sheep_need_retry, req->rq.epoch, MAX_RETRY_COUNT); if (ret) { sockfd_cache_del_node(nid); err_ret = SD_RES_NETWORK_ERROR; sd_debug("fail %d", ret); break; } write_info_advance(&wi, nid, sfd); } if (local != -1 && err_ret == SD_RES_SUCCESS) { assert(op); ret = sheep_do_op_work(op, req); if (ret != SD_RES_SUCCESS) { sd_err("fail to write local %"PRIx64", %s", oid, sd_strerror(ret)); err_ret = ret; } } sd_debug("nr_sent %d, err %x", wi.nr_sent, err_ret); if (wi.nr_sent > 0) { ret = wait_forward_request(&wi, req); if (ret != SD_RES_SUCCESS) err_ret = ret; } return err_ret; } int gateway_write_obj(struct request *req) { uint64_t oid = req->rq.obj.oid; if (oid_is_readonly(oid)) return SD_RES_READONLY; if (!bypass_object_cache(req)) return object_cache_handle_request(req); 
return gateway_forward_request(req); } int gateway_create_and_write_obj(struct request *req) { uint64_t oid = req->rq.obj.oid; if (oid_is_readonly(oid)) return SD_RES_READONLY; if (!bypass_object_cache(req)) return object_cache_handle_request(req); return gateway_forward_request(req); } int gateway_remove_obj(struct request *req) { return gateway_forward_request(req); } sheepdog-0.7.5/sheep/group.c000066400000000000000000000632511223630776600157500ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "sheep_priv.h" struct node { struct sd_node ent; struct list_head list; }; struct get_vdis_work { struct work work; DECLARE_BITMAP(vdi_inuse, SD_NR_VDIS); struct sd_node joined; size_t nr_members; struct sd_node members[]; }; static pthread_mutex_t wait_vdis_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t wait_vdis_cond = PTHREAD_COND_INITIALIZER; static refcnt_t nr_get_vdis_works; static main_thread(struct vnode_info *) current_vnode_info; static main_thread(struct list_head *) pending_block_list; static main_thread(struct list_head *) pending_notify_list; static int get_zones_nr_from(const struct sd_node *nodes, int nr_nodes) { int nr_zones = 0, i, j; uint32_t zones[SD_MAX_COPIES]; for (i = 0; i < nr_nodes; i++) { /* * Only count zones that actually store data, pure gateways * don't contribute to the redundancy level. 
*/ if (!nodes[i].nr_vnodes) continue; for (j = 0; j < nr_zones; j++) { if (nodes[i].zone == zones[j]) break; } if (j == nr_zones) { zones[nr_zones] = nodes[i].zone; if (++nr_zones == ARRAY_SIZE(zones)) break; } } return nr_zones; } static int get_node_idx(struct vnode_info *vnode_info, struct sd_node *ent) { ent = xbsearch(ent, vnode_info->nodes, vnode_info->nr_nodes, node_cmp); if (!ent) return -1; return ent - vnode_info->nodes; } /* * Grab an additional reference to the passed in vnode info. * * The caller must already hold a reference to vnode_info, this function must * only be used to grab an additional reference from code that wants the * vnode information to outlive the request structure. */ struct vnode_info *grab_vnode_info(struct vnode_info *vnode_info) { refcount_inc(&vnode_info->refcnt); return vnode_info; } /* * Get a reference to the currently active vnode information structure, * this must only be called from the main thread. * This can return NULL if cluster is not started yet. */ main_fn struct vnode_info *get_vnode_info(void) { struct vnode_info *cur_vinfo = main_thread_get(current_vnode_info); if (cur_vinfo == NULL) return NULL; return grab_vnode_info(cur_vinfo); } /* Release a reference to the current vnode information. 
*/ void put_vnode_info(struct vnode_info *vnode_info) { if (vnode_info) { if (refcount_dec(&vnode_info->refcnt) == 0) free(vnode_info); } } struct vnode_info *alloc_vnode_info(const struct sd_node *nodes, size_t nr_nodes) { struct vnode_info *vnode_info; vnode_info = xzalloc(sizeof(*vnode_info)); vnode_info->nr_nodes = nr_nodes; memcpy(vnode_info->nodes, nodes, sizeof(*nodes) * nr_nodes); xqsort(vnode_info->nodes, nr_nodes, node_cmp); recalculate_vnodes(vnode_info->nodes, nr_nodes); vnode_info->nr_vnodes = nodes_to_vnodes(vnode_info->nodes, nr_nodes, vnode_info->vnodes); vnode_info->nr_zones = get_zones_nr_from(nodes, nr_nodes); refcount_set(&vnode_info->refcnt, 1); return vnode_info; } struct vnode_info *get_vnode_info_epoch(uint32_t epoch, struct vnode_info *cur_vinfo) { struct sd_node nodes[SD_MAX_NODES]; int nr_nodes; nr_nodes = epoch_log_read(epoch, nodes, sizeof(nodes)); if (nr_nodes < 0) { nr_nodes = epoch_log_read_remote(epoch, nodes, sizeof(nodes), NULL, cur_vinfo); if (nr_nodes == 0) return NULL; } return alloc_vnode_info(nodes, nr_nodes); } int local_get_node_list(const struct sd_req *req, struct sd_rsp *rsp, void *data) { int nr_nodes; struct vnode_info *cur_vinfo = main_thread_get(current_vnode_info); if (cur_vinfo) { nr_nodes = cur_vinfo->nr_nodes; memcpy(data, cur_vinfo->nodes, sizeof(struct sd_node) * nr_nodes); rsp->data_length = nr_nodes * sizeof(struct sd_node); rsp->node.nr_nodes = nr_nodes; rsp->node.local_idx = get_node_idx(cur_vinfo, &sys->this_node); } else { rsp->node.nr_nodes = 0; rsp->node.local_idx = 0; } return SD_RES_SUCCESS; } /* Indicator if a cluster operation is currently running. 
*/ static bool cluster_op_running; static struct vdi_op_message *prepare_cluster_msg(struct request *req, size_t *sizep) { struct vdi_op_message *msg; size_t size; if (has_process_main(req->op) && req->rq.flags & SD_FLAG_CMD_WRITE) /* notify data that was received from the sender */ size = sizeof(*msg) + req->rq.data_length; else /* notify data that was set in process_work */ size = sizeof(*msg) + req->rp.data_length; assert(size <= SD_MAX_EVENT_BUF_SIZE); msg = xzalloc(size); memcpy(&msg->req, &req->rq, sizeof(struct sd_req)); memcpy(&msg->rsp, &req->rp, sizeof(struct sd_rsp)); if (has_process_main(req->op) && size > sizeof(*msg)) memcpy(msg->data, req->data, size - sizeof(*msg)); *sizep = size; return msg; } static void cluster_op_done(struct work *work) { struct request *req = container_of(work, struct request, work); struct vdi_op_message *msg; size_t size; int ret; if (req->status == REQUEST_DROPPED) goto drop; sd_debug("%s (%p)", op_name(req->op), req); msg = prepare_cluster_msg(req, &size); ret = sys->cdrv->unblock(msg, size); if (ret != SD_RES_SUCCESS) { /* * Failed to unblock, shoot myself to let other sheep * unblock the event. * FIXME: handle it gracefully. */ sd_emerg("Failed to unblock, %s, exiting.", sd_strerror(ret)); exit(1); } free(msg); req->status = REQUEST_DONE; return; drop: list_del(&req->pending_list); req->rp.result = SD_RES_CLUSTER_ERROR; put_request(req); cluster_op_running = false; } /* * Perform a blocked cluster operation if we were the node requesting it * and do not have any other operation pending. * * If this method returns false the caller must call the method again for * the same event once it gets notified again. * * Must run in the main thread as it accesses unlocked state like * sys->pending_list. 
*/ main_fn bool sd_block_handler(const struct sd_node *sender) { struct request *req; if (!node_is_local(sender)) return false; if (cluster_op_running) return false; cluster_op_running = true; req = list_first_entry(main_thread_get(pending_block_list), struct request, pending_list); req->work.fn = do_process_work; req->work.done = cluster_op_done; queue_work(sys->block_wqueue, &req->work); req->status = REQUEST_QUEUED; return true; } /* * Execute a cluster operation by letting the cluster driver send it to all * nodes in the cluster. * * Must run in the main thread as it access unlocked state like * sys->pending_list. */ main_fn void queue_cluster_request(struct request *req) { int ret; sd_debug("%s (%p)", op_name(req->op), req); if (has_process_work(req->op)) { ret = sys->cdrv->block(); if (ret != SD_RES_SUCCESS) { sd_err("failed to broadcast block to cluster, %s", sd_strerror(ret)); goto error; } list_add_tail(&req->pending_list, main_thread_get(pending_block_list)); } else { struct vdi_op_message *msg; size_t size; msg = prepare_cluster_msg(req, &size); msg->rsp.result = SD_RES_SUCCESS; ret = sys->cdrv->notify(msg, size); if (ret != SD_RES_SUCCESS) { sd_err("failed to broadcast notify to cluster, %s", sd_strerror(ret)); goto error; } list_add_tail(&req->pending_list, main_thread_get(pending_notify_list)); free(msg); } req->status = REQUEST_INIT; return; error: req->rp.result = ret; put_request(req); } static inline int get_nodes_nr_from(struct list_head *l) { struct node *node; int nr = 0; list_for_each_entry(node, l, list) { nr++; } return nr; } int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len, time_t *timestamp, struct vnode_info *vinfo) { int i, nr, ret; char buf[SD_MAX_NODES * sizeof(struct sd_node) + sizeof(time_t)]; nr = vinfo->nr_nodes; for (i = 0; i < nr; i++) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; const struct sd_node *node = vinfo->nodes + i; int nodes_len; if (node_is_local(node)) continue; 
sd_init_req(&hdr, SD_OP_GET_EPOCH); hdr.data_length = len; hdr.obj.tgt_epoch = epoch; hdr.epoch = sys_epoch(); ret = sheep_exec_req(&node->nid, &hdr, buf); if (ret != SD_RES_SUCCESS) continue; nodes_len = rsp->data_length - sizeof(timestamp); memcpy((void *)nodes, buf, nodes_len); if (timestamp) memcpy(timestamp, buf + nodes_len, sizeof(timestamp)); return nodes_len / sizeof(struct sd_node); } /* * If no node has targeted epoch log, return 0 here to at least * allow reading older epoch logs. */ return 0; } static bool cluster_ctime_check(const struct cluster_info *cinfo) { if (cinfo->epoch == 0 || sys->cinfo.epoch == 0) return true; if (cinfo->ctime != sys->cinfo.ctime) { sd_err("joining node ctime doesn't match: %" PRIu64 " vs %" PRIu64, cinfo->ctime, sys->cinfo.ctime); return false; } return true; } /* * Check whether enough node members are gathered. * * Sheepdog can start automatically if and only if all the members in the latest * epoch are gathered. */ static bool enough_nodes_gathered(struct cluster_info *cinfo, const struct sd_node *joining, const struct sd_node *nodes, size_t nr_nodes) { for (int i = 0; i < cinfo->nr_nodes; i++) { const struct sd_node *key = cinfo->nodes + i, *n; n = xlfind(key, nodes, nr_nodes, node_cmp); if (n == NULL && !node_eq(key, joining)) { sd_debug("%s doesn't join yet", node_to_str(key)); return false; } } sd_debug("all the nodes are gathered, %d, %zd", cinfo->nr_nodes, nr_nodes); return true; } static enum sd_status cluster_wait_check(const struct sd_node *joining, const struct sd_node *nodes, size_t nr_nodes, struct cluster_info *cinfo) { if (!cluster_ctime_check(cinfo)) { sd_debug("joining node is invalid"); return sys->cinfo.status; } if (cinfo->epoch > sys->cinfo.epoch) { sd_debug("joining node has a larger epoch, %" PRIu32 ", %" PRIu32, cinfo->epoch, sys->cinfo.epoch); sys->cinfo = *cinfo; } /* * If we have all members from the last epoch log in the in-memory * node list, we can set the cluster live now. 
*/ if (sys->cinfo.epoch > 0 && enough_nodes_gathered(&sys->cinfo, joining, nodes, nr_nodes)) return SD_STATUS_OK; return sys->cinfo.status; } static int get_vdis_from(struct sd_node *node) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; struct vdi_state *vs = NULL; int i, ret = SD_RES_SUCCESS; unsigned int rlen; int count; if (node_is_local(node)) goto out; rlen = SD_DATA_OBJ_SIZE; /* FIXME */ vs = xzalloc(rlen); sd_init_req(&hdr, SD_OP_GET_VDI_COPIES); hdr.data_length = rlen; hdr.epoch = sys_epoch(); ret = sheep_exec_req(&node->nid, &hdr, (char *)vs); if (ret != SD_RES_SUCCESS) goto out; count = rsp->data_length / sizeof(*vs); for (i = 0; i < count; i++) { atomic_set_bit(vs[i].vid, sys->vdi_inuse); add_vdi_state(vs[i].vid, vs[i].nr_copies, vs[i].snapshot); } out: free(vs); return ret; } static void do_get_vdis(struct work *work) { struct get_vdis_work *w = container_of(work, struct get_vdis_work, work); int i, ret; if (!node_is_local(&w->joined)) { sd_debug("try to get vdi bitmap from %s", node_to_str(&w->joined)); ret = get_vdis_from(&w->joined); if (ret != SD_RES_SUCCESS) sd_alert("failed to get vdi bitmap from %s", node_to_str(&w->joined)); return; } for (i = 0; i < w->nr_members; i++) { /* We should not fetch vdi_bitmap and copy list from myself */ if (node_is_local(&w->members[i])) continue; sd_debug("try to get vdi bitmap from %s", node_to_str(&w->members[i])); ret = get_vdis_from(&w->members[i]); if (ret != SD_RES_SUCCESS) { /* try to read from another node */ sd_alert("failed to get vdi bitmap from %s", node_to_str(&w->members[i])); continue; } /* * TODO: If the target node has a valid vdi bitmap (the node has * already called do_get_vdis against all the nodes), we can * exit this loop here. 
*/ } } static void get_vdis_done(struct work *work) { struct get_vdis_work *w = container_of(work, struct get_vdis_work, work); pthread_mutex_lock(&wait_vdis_lock); refcount_dec(&nr_get_vdis_works); pthread_cond_broadcast(&wait_vdis_cond); pthread_mutex_unlock(&wait_vdis_lock); free(w); } int inc_and_log_epoch(void) { struct vnode_info *cur_vinfo = main_thread_get(current_vnode_info); if (cur_vinfo) { /* update cluster info to the latest state */ sys->cinfo.nr_nodes = cur_vinfo->nr_nodes; memcpy(sys->cinfo.nodes, cur_vinfo->nodes, sizeof(cur_vinfo->nodes[0]) * cur_vinfo->nr_nodes); } else sys->cinfo.nr_nodes = 0; uatomic_inc(&sys->cinfo.epoch); return update_epoch_log(sys->cinfo.epoch, sys->cinfo.nodes, sys->cinfo.nr_nodes); } static struct vnode_info *alloc_old_vnode_info(const struct sd_node *joined, const struct sd_node *nodes, size_t nr_nodes) { struct sd_node old_nodes[SD_MAX_NODES]; /* exclude the newly added one */ memcpy(old_nodes, nodes, sizeof(*nodes) * nr_nodes); xlremove(joined, old_nodes, &nr_nodes, node_cmp); return alloc_vnode_info(old_nodes, nr_nodes); } static void setup_backend_store(const struct cluster_info *cinfo) { int ret; if (cinfo->store[0] == '\0') return; if (!sd_store) { sd_store = find_store_driver((char *)cinfo->store); if (!sd_store) panic("backend store %s not supported", cinfo->store); ret = sd_store->init(); if (ret != SD_RES_SUCCESS) panic("failed to initialize store"); } /* * We need to purge the stale objects for sheep joining back * after crash */ if (xlfind(&sys->this_node, cinfo->nodes, cinfo->nr_nodes, node_cmp) == NULL) { ret = sd_store->purge_obj(); if (ret != SD_RES_SUCCESS) panic("can't remove stale objects"); } } static void finish_join(const struct sd_node *nodes, size_t nr_nodes) { sockfd_cache_add_group(nodes, nr_nodes); } static void get_vdis(const struct sd_node *nodes, size_t nr_nodes, const struct sd_node *joined) { int array_len = nr_nodes * sizeof(struct sd_node); struct get_vdis_work *w; w = xmalloc(sizeof(*w) 
+ array_len); w->joined = *joined; w->nr_members = nr_nodes; memcpy(w->members, nodes, array_len); refcount_inc(&nr_get_vdis_works); w->work.fn = do_get_vdis; w->work.done = get_vdis_done; queue_work(sys->block_wqueue, &w->work); } void wait_get_vdis_done(void) { sd_debug("waiting for vdi list"); pthread_mutex_lock(&wait_vdis_lock); while (refcount_read(&nr_get_vdis_works) > 0) pthread_cond_wait(&wait_vdis_cond, &wait_vdis_lock); pthread_mutex_unlock(&wait_vdis_lock); sd_debug("vdi list ready"); } void recalculate_vnodes(struct sd_node *nodes, int nr_nodes) { int i, nr_non_gateway_nodes = 0; uint64_t avg_size = 0; float factor; for (i = 0; i < nr_nodes; i++) { if (nodes[i].space) { avg_size += nodes[i].space; nr_non_gateway_nodes++; } } if (!nr_non_gateway_nodes) return; avg_size /= nr_non_gateway_nodes; for (i = 0; i < nr_nodes; i++) { factor = (float)nodes[i].space / (float)avg_size; nodes[i].nr_vnodes = rintf(SD_DEFAULT_VNODES * factor); sd_debug("node %d has %d vnodes, free space %" PRIu64, nodes[i].nid.port, nodes[i].nr_vnodes, nodes[i].space); } } static void update_cluster_info(const struct cluster_info *cinfo, const struct sd_node *joined, const struct sd_node *nodes, size_t nr_nodes) { struct vnode_info *old_vnode_info; sd_debug("status = %d, epoch = %d", cinfo->status, cinfo->epoch); if (!sys->gateway_only) setup_backend_store(cinfo); if (node_is_local(joined)) finish_join(nodes, nr_nodes); old_vnode_info = main_thread_get(current_vnode_info); main_thread_set(current_vnode_info, alloc_vnode_info(nodes, nr_nodes)); get_vdis(nodes, nr_nodes, joined); if (cinfo->status == SD_STATUS_OK) { if (!is_cluster_formatted()) /* initialize config file */ set_cluster_config(&sys->cinfo); if (nr_nodes != cinfo->nr_nodes) { int ret = inc_and_log_epoch(); if (ret != 0) panic("cannot log current epoch %d", sys->cinfo.epoch); if (!old_vnode_info) { old_vnode_info = alloc_old_vnode_info(joined, nodes, nr_nodes); } start_recovery(main_thread_get(current_vnode_info), 
old_vnode_info, true); } else start_recovery(main_thread_get(current_vnode_info), main_thread_get(current_vnode_info), false); } put_vnode_info(old_vnode_info); sockfd_cache_add(&joined->nid); } /* * Pass on a notification message from the cluster driver. * * Must run in the main thread as it accesses unlocked state like * sys->pending_list. */ main_fn void sd_notify_handler(const struct sd_node *sender, void *data, size_t data_len) { struct vdi_op_message *msg = data; const struct sd_op_template *op = get_sd_op(msg->req.opcode); int ret = msg->rsp.result; struct request *req = NULL; sd_debug("op %s, size: %zu, from: %s", op_name(op), data_len, node_to_str(sender)); if (node_is_local(sender)) { if (has_process_work(op)) req = list_first_entry( main_thread_get(pending_block_list), struct request, pending_list); else req = list_first_entry( main_thread_get(pending_notify_list), struct request, pending_list); list_del(&req->pending_list); } if (ret == SD_RES_SUCCESS && has_process_main(op)) ret = do_process_main(op, &msg->req, &msg->rsp, msg->data); if (req) { msg->rsp.result = ret; if (has_process_main(req->op) && !(req->rq.flags & SD_FLAG_CMD_WRITE)) memcpy(req->data, msg->data, msg->rsp.data_length); memcpy(&req->rp, &msg->rsp, sizeof(req->rp)); put_request(req); } if (has_process_work(op)) cluster_op_running = false; } /* * Accept the joining node and pass the cluster info to it. * * Note that 'nodes' doesn't contain 'joining'. * * Return true if the joining node is accepted. At least one nodes in the * cluster must call this function and succeed in accept of the joining node. */ main_fn bool sd_join_handler(const struct sd_node *joining, const struct sd_node *nodes, size_t nr_nodes, void *opaque) { struct cluster_info *cinfo = opaque; enum sd_status status; /* * If nr_nodes is 0, the joining node is the first member of the cluster * and joins sheepdog successfully without any check. 
If nr_nodes is * not 0, the joining node has to wait for another node to accept it. */ if (nr_nodes > 0 && node_is_local(joining)) { sd_debug("wait for another node to accept this node"); return false; } sd_debug("check %s, %d", node_to_str(joining), sys->cinfo.status); if (sys->cinfo.status == SD_STATUS_WAIT) status = cluster_wait_check(joining, nodes, nr_nodes, cinfo); else status = sys->cinfo.status; *cinfo = sys->cinfo; cinfo->status = status; cinfo->proto_ver = SD_SHEEP_PROTO_VER; sd_debug("%s: cluster_status = 0x%x", addr_to_str(joining->nid.addr, joining->nid.port), cinfo->status); return true; } static int send_join_request(struct sd_node *ent) { sd_info("%s", node_to_str(&sys->this_node)); return sys->cdrv->join(ent, &sys->cinfo, sizeof(sys->cinfo)); } static void requeue_cluster_request(void) { struct request *req, *p; struct vdi_op_message *msg; size_t size; list_for_each_entry_safe(req, p, main_thread_get(pending_notify_list), pending_list) { /* * ->notify() was called and succeeded but after that * this node session-timeouted and sd_notify_handler * wasn't called from notify event handler in cluster * driver. We manually call sd_notify_handler to finish * the request. */ sd_debug("finish pending notify request, op: %s", op_name(req->op)); msg = prepare_cluster_msg(req, &size); sd_notify_handler(&sys->this_node, msg, size); free(msg); } list_for_each_entry_safe(req, p, main_thread_get(pending_block_list), pending_list) { switch (req->status) { case REQUEST_INIT: /* this request has never been executed, re-queue it */ sd_debug("requeue a block request, op: %s", op_name(req->op)); list_del(&req->pending_list); queue_cluster_request(req); break; case REQUEST_QUEUED: /* * This request is being handled by the 'block' thread * and ->unblock() isn't called yet. We can't call * ->unblock thereafter because other sheep has * unblocked themselves due to cluster driver session * timeout. Mark it as dropped to stop cluster_op_done() * from calling ->unblock. 
*/ sd_debug("drop pending block request, op: %s", op_name(req->op)); req->status = REQUEST_DROPPED; break; case REQUEST_DONE: /* * ->unblock() was called and succeeded but after that * this node session-timeouted and sd_notify_handler * wasn't called from unblock event handler in cluster * driver. We manually call sd_notify_handler to finish * the request. */ sd_debug("finish pending block request, op: %s", op_name(req->op)); msg = prepare_cluster_msg(req, &size); sd_notify_handler(&sys->this_node, msg, size); free(msg); break; default: break; } } } main_fn int sd_reconnect_handler(void) { sys->cinfo.status = SD_STATUS_WAIT; if (sys->cdrv->init(sys->cdrv_option) != 0) return -1; if (send_join_request(&sys->this_node) != 0) return -1; requeue_cluster_request(); return 0; } static bool cluster_join_check(const struct cluster_info *cinfo) { if (cinfo->proto_ver != SD_SHEEP_PROTO_VER) { sd_err("invalid protocol version: %d, %d", cinfo->proto_ver, SD_SHEEP_PROTO_VER); return false; } if (!cluster_ctime_check(cinfo)) return false; if (cinfo->epoch == sys->cinfo.epoch && memcmp(cinfo->nodes, sys->cinfo.nodes, sizeof(cinfo->nodes[0]) * cinfo->nr_nodes) != 0) { sd_alert("epoch log entries does not match"); return false; } return true; } main_fn void sd_accept_handler(const struct sd_node *joined, const struct sd_node *members, size_t nr_members, const void *opaque) { int i; const struct cluster_info *cinfo = opaque; if (!cluster_join_check(cinfo)) { sd_err("failed to join Sheepdog"); exit(1); } sys->cinfo = *cinfo; sd_debug("join %s", node_to_str(joined)); for (i = 0; i < nr_members; i++) sd_debug("[%x] %s", i, node_to_str(members + i)); if (sys->cinfo.status == SD_STATUS_SHUTDOWN) return; update_cluster_info(cinfo, joined, members, nr_members); if (node_is_local(joined)) /* this output is used for testing */ sd_debug("join Sheepdog cluster"); } main_fn void sd_leave_handler(const struct sd_node *left, const struct sd_node *members, size_t nr_members) { struct vnode_info 
*old_vnode_info; int i, ret; sd_debug("leave %s", node_to_str(left)); for (i = 0; i < nr_members; i++) sd_debug("[%x] %s", i, node_to_str(members + i)); if (sys->cinfo.status == SD_STATUS_SHUTDOWN) return; if (node_is_local(left)) /* Mark leave node as gateway only node */ sys->this_node.nr_vnodes = 0; old_vnode_info = main_thread_get(current_vnode_info); main_thread_set(current_vnode_info, alloc_vnode_info(members, nr_members)); if (sys->cinfo.status == SD_STATUS_OK) { ret = inc_and_log_epoch(); if (ret != 0) panic("cannot log current epoch %d", sys->cinfo.epoch); start_recovery(main_thread_get(current_vnode_info), old_vnode_info, true); } put_vnode_info(old_vnode_info); sockfd_cache_del_node(&left->nid); } static void update_node_size(struct sd_node *node) { struct vnode_info *cur_vinfo = main_thread_get(current_vnode_info); int idx = get_node_idx(cur_vinfo, node); assert(idx != -1); cur_vinfo->nodes[idx].space = node->space; } static void kick_node_recover(void) { struct vnode_info *old = main_thread_get(current_vnode_info); int ret; main_thread_set(current_vnode_info, alloc_vnode_info(old->nodes, old->nr_nodes)); ret = inc_and_log_epoch(); if (ret != 0) panic("cannot log current epoch %d", sys->cinfo.epoch); start_recovery(main_thread_get(current_vnode_info), old, true); put_vnode_info(old); } main_fn void sd_update_node_handler(struct sd_node *node) { update_node_size(node); kick_node_recover(); } int create_cluster(int port, int64_t zone, int nr_vnodes, bool explicit_addr) { int ret; if (!sys->cdrv) { sys->cdrv = find_cdrv(DEFAULT_CLUSTER_DRIVER); sd_debug("use %s cluster driver as default", DEFAULT_CLUSTER_DRIVER); } ret = sys->cdrv->init(sys->cdrv_option); if (ret < 0) return -1; if (!explicit_addr) { ret = sys->cdrv->get_local_addr(sys->this_node.nid.addr); if (ret < 0) return -1; } sys->this_node.nid.port = port; sys->this_node.nr_vnodes = nr_vnodes; if (zone == -1) { /* use last 4 bytes as zone id */ uint8_t *b = sys->this_node.nid.addr + 12; 
sys->this_node.zone = b[0] | b[1] << 8 | b[2] << 16 | b[3] << 24; } else sys->this_node.zone = zone; sd_debug("zone id = %u", sys->this_node.zone); sys->this_node.space = sys->disk_space; sys->cinfo.epoch = get_latest_epoch(); if (sys->cinfo.epoch) { sys->cinfo.nr_nodes = epoch_log_read(sys->cinfo.epoch, sys->cinfo.nodes, sizeof(sys->cinfo.nodes)); if (sys->cinfo.nr_nodes == -1) return -1; } sys->cinfo.status = SD_STATUS_WAIT; main_thread_set(pending_block_list, xzalloc(sizeof(struct list_head))); INIT_LIST_HEAD(main_thread_get(pending_block_list)); main_thread_set(pending_notify_list, xzalloc(sizeof(struct list_head))); INIT_LIST_HEAD(main_thread_get(pending_notify_list)); INIT_LIST_HEAD(&sys->local_req_queue); INIT_LIST_HEAD(&sys->req_wait_queue); ret = send_join_request(&sys->this_node); if (ret != 0) return -1; return 0; } /* * We will call this function for two reason: * 1) make this node working as a gateway, or * 2) the program is going to shutdown itself. */ int leave_cluster(void) { static bool left; if (left) return 0; left = true; return sys->cdrv->leave(); } sheepdog-0.7.5/sheep/http.c000066400000000000000000000161461223630776600155740ustar00rootroot00000000000000/* * Copyright (C) 2013 Taobao Inc. * * Liu Yuan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ /* This files implement RESTful interface to sheepdog storage via fastcgi */ #include #include "sheep_priv.h" struct http_request { FCGX_Request fcgx; int opcode; char *data; size_t data_length; }; enum http_opcode { HTTP_GET = 1, HTTP_PUT, HTTP_POST, HTTP_DELETE, HTTP_HEAD, }; enum http_status { OK = 1, /* 200 */ CREATED, /* 201 */ PARTIAL_CONTENT, /* 206 */ BAD_REQUEST, /* 400 */ NOT_FOUND, /* 404 */ REQUEST_RANGE_NOT_SATISFIABLE, /* 416 */ INTERNAL_SERVER_ERROR, /* 500 */ NOT_IMPLEMENTED, /* 501 */ }; static inline const char *strstatus(int status) { static const char *const descs[] = { [OK] = "200 OK", [CREATED] = "201 CREATED", [PARTIAL_CONTENT] = "206 Partial Content", [BAD_REQUEST] = "400 Bad Request", [NOT_FOUND] = "404 Not Found", [REQUEST_RANGE_NOT_SATISFIABLE] = "416 Requested Range Not Satisfiable", [INTERNAL_SERVER_ERROR] = "500 Internal Server Error", [NOT_IMPLEMENTED] = "501 Not Implemented", }; if (descs[status] == NULL) { static __thread char msg[32]; snprintf(msg, sizeof(msg), "Invalid Status %d", status); return msg; } return descs[status]; } struct http_work { struct work work; struct http_request *request; }; static inline int http_request_error(struct http_request *req) { int ret = FCGX_GetError(req->fcgx.out); if (ret == 0) { return OK; } else if (ret < 0) { sd_err("failed, FCGI error %d", ret); return INTERNAL_SERVER_ERROR; } else { sd_err("failed, %s", strerror(ret)); return INTERNAL_SERVER_ERROR; } } static inline int http_request_write(struct http_request *req, const char *buf, int len) { int ret = FCGX_PutStr(buf, len, req->fcgx.out); if (ret < 0) return http_request_error(req); return OK; } static inline int http_request_read(struct http_request *req, char *buf, int len) { int ret = FCGX_GetStr(buf, len, req->fcgx.in); if (ret < 0) return http_request_error(req); return OK; } static inline int http_request_writes(struct http_request *req, const char *str) { int ret = FCGX_PutS(str, req->fcgx.out); if (ret < 0) return 
http_request_error(req); return OK; } __printf(2, 3) static int http_request_writef(struct http_request *req, const char *fmt, ...) { va_list ap; int ret; va_start(ap, fmt); ret = FCGX_VFPrintF(req->fcgx.out, fmt, ap); va_end(ap); if (ret < 0) return http_request_error(req); return OK; } static int request_init_operation(struct http_request *req) { char **env = req->fcgx.envp; char *p; p = FCGX_GetParam("REQUEST_METHOD", env); if (!strcmp(p, "PUT")) { req->opcode = HTTP_PUT; p = FCGX_GetParam("CONTENT_LENGTH", env); req->data_length = strtoll(p, NULL, 10); req->data = xmalloc(req->data_length); http_request_read(req, req->data, req->data_length); } else if (!strcmp(p, "GET")) { req->opcode = HTTP_GET; } else if (!strcmp(p, "POST")) { req->opcode = HTTP_POST; } else if (!strcmp(p, "DELETE")) { req->opcode = HTTP_DELETE; } else if (!strcmp(p, "HEAD")) { req->opcode = HTTP_HEAD; } else { return BAD_REQUEST; } return OK; } static int http_init_request(struct http_request *req) { char *p; int ret; for (int i = 0; (p = req->fcgx.envp[i]); ++i) sd_debug("%s", p); ret = request_init_operation(req); if (ret != OK) return ret; return OK; } static void http_response_header(struct http_request *req, int status) { http_request_writef(req, "Status: %s\n", strstatus(status)); http_request_writes(req, "Content-type: text/plain;\r\n\r\n"); } static void http_handle_get(struct http_request *req) { http_response_header(req, NOT_IMPLEMENTED); http_request_writes(req, "not implemented\n"); } static void http_handle_put(struct http_request *req) { http_response_header(req, NOT_IMPLEMENTED); http_request_writes(req, "not implemented\n"); } static void http_handle_post(struct http_request *req) { http_response_header(req, NOT_IMPLEMENTED); http_request_writes(req, "not implemented\n"); } static void http_handle_delete(struct http_request *req) { http_response_header(req, NOT_IMPLEMENTED); http_request_writes(req, "not implemented\n"); } static void http_handle_head(struct http_request 
*req) { http_response_header(req, NOT_IMPLEMENTED); http_request_writes(req, "not implemented\n"); } static void (*const http_request_handlers[])(struct http_request *req) = { [HTTP_GET] = http_handle_get, [HTTP_PUT] = http_handle_put, [HTTP_POST] = http_handle_post, [HTTP_DELETE] = http_handle_delete, [HTTP_HEAD] = http_handle_head, }; static const int http_max_request_handlers = ARRAY_SIZE(http_request_handlers); static void http_end_request(struct http_request *req) { FCGX_Finish_r(&req->fcgx); free(req->data); free(req); } static void http_run_request(struct work *work) { struct http_work *hw = container_of(work, struct http_work, work); struct http_request *req = hw->request; int op = req->opcode; if (op < http_max_request_handlers && http_request_handlers[op]) http_request_handlers[op](req); else panic("unhandled opcode %d", op); http_end_request(req); } static void http_request_done(struct work *work) { struct http_work *hw = container_of(work, struct http_work, work); free(hw); } static void http_queue_request(struct http_request *req) { struct http_work *hw = xmalloc(sizeof(*hw)); hw->work.fn = http_run_request; hw->work.done = http_request_done; hw->request = req; queue_work(sys->http_wqueue, &hw->work); } static inline struct http_request *http_new_request(int sockfd) { struct http_request *req = xzalloc(sizeof(*req)); FCGX_InitRequest(&req->fcgx, sockfd, 0); return req; } static int http_sockfd; static void *http_main_loop(void *ignored) { int err; for (;;) { struct http_request *req = http_new_request(http_sockfd); int ret; ret = FCGX_Accept_r(&req->fcgx); if (ret < 0) { sd_err("accept failed, %d, %d", http_sockfd, ret); goto out; } ret = http_init_request(req); if (ret != OK) { http_response_header(req, ret); http_end_request(req); continue; } http_queue_request(req); } out: err = pthread_detach(pthread_self()); if (err) sd_err("%s", strerror(err)); pthread_exit(NULL); } int http_init(const char *address) { pthread_t t; int err; sys->http_wqueue = 
create_work_queue("http", WQ_DYNAMIC); if (!sys->http_wqueue) return -1; FCGX_Init(); #define LISTEN_QUEUE_DEPTH 1024 /* No rationale */ http_sockfd = FCGX_OpenSocket(address, LISTEN_QUEUE_DEPTH); if (http_sockfd < 0) { sd_err("open socket failed, address %s", address); return -1; } sd_info("http service listen at %s", address); err = pthread_create(&t, NULL, http_main_loop, NULL); if (err) { sd_err("%s", strerror(err)); return -1; } return 0; } sheepdog-0.7.5/sheep/journal.c000066400000000000000000000234651223630776600162710ustar00rootroot00000000000000/* * Copyright (C) 2012 Taobao Inc. * * Liu Yuan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "sheep_priv.h" struct journal_file { int fd; off_t pos; int commit_fd; uatomic_bool in_commit; }; /* * CAUTION: This definition of struct journal_descriptor must be same * to the definition in tests/dynamorio/journaling/journaling.c. We * have to update the definition in the DR client definition if we * update the below definition because there's no technique for * keeping the consistency automatically. 
*/ struct journal_descriptor { uint32_t magic; uint16_t flag; uint16_t reserved; union { uint32_t epoch; uint64_t oid; }; uint64_t offset; uint64_t size; uint8_t create; uint8_t pad[475]; } __packed; /* JOURNAL_DESC + JOURNAL_MARKER must be 512 algined for DIO */ #define JOURNAL_DESC_MAGIC 0xfee1900d #define JOURNAL_DESC_SIZE 508 #define JOURNAL_MARKER_SIZE 4 /* Use marker to detect partial write */ #define JOURNAL_META_SIZE (JOURNAL_DESC_SIZE + JOURNAL_MARKER_SIZE) #define JOURNAL_END_MARKER 0xdeadbeef #define JF_STORE 0 #define JF_REMOVE_OBJ 2 static const char *jfile_name[2] = { "journal_file0", "journal_file1", }; static int jfile_fds[2]; static size_t jfile_size; static struct journal_file jfile; static pthread_spinlock_t jfile_lock; static int create_journal_file(const char *root, const char *name) { int fd, flags = O_DSYNC | O_RDWR | O_TRUNC | O_CREAT | O_DIRECT; char path[PATH_MAX]; snprintf(path, sizeof(path), "%s/%s", root, name); fd = open(path, flags, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); if (fd < 0) { sd_err("open %s %m", name); return -1; } if (prealloc(fd, jfile_size) < 0) { sd_err("prealloc %s %m", name); return -1; } return fd; } /* We should have two valid FDs, otherwise something goes wrong */ static int get_old_new_jfile(const char *p, int *old, int *new) { int fd1, fd2; int flags = O_RDONLY; char path[PATH_MAX]; struct stat st1, st2; snprintf(path, sizeof(path), "%s/%s", p, jfile_name[0]); fd1 = open(path, flags); if (fd1 < 0) { if (errno == ENOENT) return 0; sd_err("open1 %m"); return -1; } snprintf(path, sizeof(path), "%s/%s", p, jfile_name[1]); fd2 = open(path, flags); if (fd2 < 0) { sd_err("open2 %m"); close(fd1); return -1; } if (fstat(fd1, &st1) < 0 || fstat(fd2, &st2) < 0) { sd_err("stat %m"); goto out; } if (st1.st_mtime < st2.st_mtime) { *old = fd1; *new = fd2; } else { *old = fd2; *new = fd1; } return 0; out: close(fd1); close(fd2); return -1; } static bool journal_entry_full_write(struct journal_descriptor *jd) { char *end = (char 
*)jd + round_up(jd->size, SECTOR_SIZE) + JOURNAL_META_SIZE; uint32_t marker = *(((uint32_t *)end) - 1); if (marker != JOURNAL_END_MARKER) return false; return true; } static int replay_journal_entry(struct journal_descriptor *jd) { char path[PATH_MAX]; ssize_t size; int fd, flags = O_WRONLY, ret = 0; void *buf = NULL; char *p = (char *)jd; snprintf(path, PATH_MAX, "%s/%016"PRIx64, md_get_object_path(jd->oid), jd->oid); if (jd->flag == JF_REMOVE_OBJ) { sd_info("%s (remove)", path); unlink(path); return 0; } sd_info("%s, size %" PRIu64 ", off %" PRIu64 ", %d", path, jd->size, jd->offset, jd->create); if (jd->create) flags |= O_CREAT; fd = open(path, flags, sd_def_fmode); if (fd < 0) { sd_err("open %m"); return -1; } if (jd->create) { ret = prealloc(fd, get_objsize(jd->oid)); if (ret < 0) goto out; } buf = xmalloc(jd->size); p += JOURNAL_DESC_SIZE; memcpy(buf, p, jd->size); size = xpwrite(fd, buf, jd->size, jd->offset); if (size != jd->size) { sd_err("write %zd, size %" PRIu64 ", errno %m", size, jd->size); ret = -1; goto out; } out: free(buf); close(fd); return ret; } static int do_recover(int fd) { struct journal_descriptor *jd; void *map; char *p, *end; struct stat st; if (fstat(fd, &st) < 0) { sd_err("fstat %m"); return -1; } if (!st.st_size) { /* * An empty journal file can be produced when sheep crashes * between ftruncate() and prealloc() of commit_data(). * Such a file should be ignored simply. 
*/ close(fd); return 0; } map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0); close(fd); if (map == MAP_FAILED) { sd_err("%m"); return -1; } end = (char *)map + st.st_size; for (p = map; p < end;) { jd = (struct journal_descriptor *)p; if (jd->magic != JOURNAL_DESC_MAGIC) { /* Empty area */ p += SECTOR_SIZE; continue; } /* We skip partial write because it is not acked back to VM */ if (!journal_entry_full_write(jd)) goto skip; if (replay_journal_entry(jd) < 0) return -1; skip: p += JOURNAL_META_SIZE + round_up(jd->size, SECTOR_SIZE); } munmap(map, st.st_size); /* Do a final sync() to assure data is reached to the disk */ sync(); return 0; } /* * We recover the journal file in order of wall time in the corner case that * sheep crashes while in the middle of journal committing. For most of cases, * we actually only recover one jfile, the other would be empty. This process * is fast with buffered IO that only take several secends at most. */ static void check_recover_journal_file(const char *p) { int old = 0, new = 0; if (get_old_new_jfile(p, &old, &new) < 0) return; /* No journal file found */ if (old == 0) return; if (do_recover(old) < 0) panic("recoverying from journal file (old) failed"); if (do_recover(new) < 0) panic("recoverying from journal file (new) failed"); } int journal_file_init(const char *path, size_t size, bool skip) { int fd; if (!skip) check_recover_journal_file(path); jfile_size = (size * 1024 * 1024) / 2; fd = create_journal_file(path, jfile_name[0]); if (fd < 0) return -1; jfile.fd = jfile_fds[0] = fd; fd = create_journal_file(path, jfile_name[1]); jfile_fds[1] = fd; pthread_spin_init(&jfile_lock, PTHREAD_PROCESS_PRIVATE); return 0; } void clean_journal_file(const char *p) { int ret; char path[PATH_MAX]; sync(); snprintf(path, sizeof(path), "%s/%s", p, jfile_name[0]); ret = unlink(path); if (ret < 0) sd_err("unlink(%s): %m", path); snprintf(path, sizeof(path), "%s/%s", p, jfile_name[1]); ret = unlink(path); if (ret < 0) 
sd_err("unlink(%s): %m", path); } static inline bool jfile_enough_space(size_t size) { if (jfile.pos + size > jfile_size) return false; return true; } /* * We rely on the kernel's page cache to cache data objects to 1) boost read * perfmance 2) simplify read path so that data commiting is simply a * sync() operation and We do it in a dedicated thread to avoid blocking * the writer by switch back and forth between two journal files. */ static void *commit_data(void *ignored) { int err; /* Tell runtime to release resources after termination */ err = pthread_detach(pthread_self()); if (unlikely(err)) panic("%s", strerror(err)); sync(); if (unlikely(xftruncate(jfile.commit_fd, 0) < 0)) panic("truncate %m"); if (unlikely(prealloc(jfile.commit_fd, jfile_size) < 0)) panic("prealloc"); uatomic_set_false(&jfile.in_commit); pthread_exit(NULL); } /* FIXME: Try not sleep inside lock */ static void switch_journal_file(void) { int old = jfile.fd, err; pthread_t thread; retry: if (unlikely(!uatomic_set_true(&jfile.in_commit))) { sd_err("journal file in committing, " "you might need enlarge jfile size"); usleep(100000); /* Wait until committing is finished */ goto retry; } if (old == jfile_fds[0]) jfile.fd = jfile_fds[1]; else jfile.fd = jfile_fds[0]; jfile.commit_fd = old; jfile.pos = 0; err = pthread_create(&thread, NULL, commit_data, NULL); if (unlikely(err)) panic("%s", strerror(err)); } static int journal_file_write(struct journal_descriptor *jd, const char *buf) { uint32_t marker = JOURNAL_END_MARKER; int ret = SD_RES_SUCCESS; uint64_t size = jd->size; ssize_t written, rusize = round_up(size, SECTOR_SIZE), wsize = JOURNAL_META_SIZE + rusize; off_t woff; char *wbuffer, *p; pthread_spin_lock(&jfile_lock); if (!jfile_enough_space(wsize)) switch_journal_file(); woff = jfile.pos; jfile.pos += wsize; pthread_spin_unlock(&jfile_lock); p = wbuffer = xvalloc(wsize); memcpy(p, jd, JOURNAL_DESC_SIZE); p += JOURNAL_DESC_SIZE; memcpy(p, buf, size); p += size; if (size < rusize) { 
memset(p, 0, rusize - size); p += rusize - size; } memcpy(p, &marker, JOURNAL_MARKER_SIZE); /* * Concurrent writes with the same FD is okay because we don't have any * critical sections that need lock inside kernel write path, since we * a) bypass page cache, b) don't modify i_size of this inode. * * Feel free to correct me If I am wrong. */ written = xpwrite(jfile.fd, wbuffer, wsize, woff); if (unlikely(written != wsize)) { sd_err("failed, written %zd, len %zd", written, wsize); /* FIXME: teach journal file handle EIO gracefully */ ret = SD_RES_EIO; goto out; } out: free(wbuffer); return ret; } int journal_write_store(uint64_t oid, const char *buf, size_t size, off_t offset, bool create) { struct journal_descriptor jd = { .magic = JOURNAL_DESC_MAGIC, .flag = JF_STORE, .offset = offset, .size = size, .create = create, }; /* We have to explicitly do assignment to get all GCC compatible */ jd.oid = oid; return journal_file_write(&jd, buf); } int journal_remove_object(uint64_t oid) { struct journal_descriptor jd = { .magic = JOURNAL_DESC_MAGIC, .flag = JF_REMOVE_OBJ, .size = 0, }; jd.oid = oid; return journal_file_write(&jd, NULL); } static __attribute__((used)) void journal_c_build_bug_ons(void) { /* never called, only for checking BUILD_BUG_ON()s */ BUILD_BUG_ON(sizeof(struct journal_descriptor) != JOURNAL_DESC_SIZE); } sheepdog-0.7.5/sheep/md.c000066400000000000000000000335541223630776600152170ustar00rootroot00000000000000/* * Copyright (C) 2013 Taobao Inc. * * Liu Yuan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "sheep_priv.h" #define MD_DEFAULT_VDISKS 128 #define MD_MAX_VDISK (MD_MAX_DISK * MD_DEFAULT_VDISKS) struct disk { char path[PATH_MAX]; uint16_t nr_vdisks; uint64_t space; }; struct vdisk { uint16_t idx; uint64_t id; }; static struct disk md_disks[MD_MAX_DISK]; static struct vdisk md_vds[MD_MAX_VDISK]; static struct sd_lock md_lock = SD_LOCK_INITIALIZER; static int md_nr_disks; /* Protected by md_lock */ static int md_nr_vds; static inline int nr_online_disks(void) { int nr; sd_read_lock(&md_lock); nr = md_nr_disks; sd_unlock(&md_lock); return nr; } static struct vdisk *oid_to_vdisk_from(struct vdisk *vds, int nr, uint64_t oid) { uint64_t id = fnv_64a_buf(&oid, sizeof(oid), FNV1A_64_INIT); int start, end, pos; start = 0; end = nr - 1; if (id > vds[end].id || id < vds[start].id) return &vds[start]; for (;;) { pos = (end - start) / 2 + start; if (vds[pos].id < id) { if (vds[pos + 1].id >= id) return &vds[pos + 1]; start = pos; } else end = pos; } } static int vdisk_cmp(const struct vdisk *d1, const struct vdisk *d2) { return intcmp(d1->id, d2->id); } static inline int disks_to_vdisks(struct disk *ds, int nmds, struct vdisk *vds) { struct disk *d_iter = ds; int i, j, nr_vdisks = 0; uint64_t hval; while (nmds--) { hval = FNV1A_64_INIT; for (i = 0; i < d_iter->nr_vdisks; i++) { hval = fnv_64a_buf(&nmds, sizeof(nmds), hval); for (j = strlen(d_iter->path) - 1; j >= 0; j--) hval = fnv_64a_buf(&d_iter->path[j], 1, hval); vds[nr_vdisks].id = hval; vds[nr_vdisks].idx = d_iter - ds; nr_vdisks++; } d_iter++; } xqsort(vds, nr_vdisks, vdisk_cmp); return nr_vdisks; } static inline struct vdisk *oid_to_vdisk(uint64_t oid) { return oid_to_vdisk_from(md_vds, md_nr_vds, oid); } static inline void trim_last_slash(char *path) { assert(path[0]); while (path[strlen(path) - 1] == '/') path[strlen(path) - 1] = '\0'; } static int path_to_disk_idx(char *path) { int i; trim_last_slash(path); for (i = 0; i < md_nr_disks; i++) if (strcmp(md_disks[i].path, path) == 0) return i; return 
-1; } bool md_add_disk(char *path) { if (path_to_disk_idx(path) != -1) { sd_err("duplicate path %s", path); return false; } if (xmkdir(path, sd_def_dmode) < 0) { sd_err("can't mkdir for %s, %m", path); return false; } md_nr_disks++; pstrcpy(md_disks[md_nr_disks - 1].path, PATH_MAX, path); sd_info("%s, nr %d", md_disks[md_nr_disks - 1].path, md_nr_disks); return true; } static inline void calculate_vdisks(struct disk *disks, int nr_disks, uint64_t total) { uint64_t avg_size = total / nr_disks; float factor; int i; for (i = 0; i < nr_disks; i++) { factor = (float)disks[i].space / (float)avg_size; md_disks[i].nr_vdisks = rintf(MD_DEFAULT_VDISKS * factor); sd_debug("%s has %d vdisks, free space %" PRIu64, md_disks[i].path, md_disks[i].nr_vdisks, md_disks[i].space); } } #define MDNAME "user.md.size" #define MDSIZE sizeof(uint64_t) static int get_total_object_size(uint64_t oid, char *wd, uint32_t epoch, void *total) { uint64_t *t = total; struct stat s; char path[PATH_MAX]; snprintf(path, PATH_MAX, "%s/%016" PRIx64, wd, oid); if (stat(path, &s) == 0) *t += s.st_blocks * SECTOR_SIZE; else *t += get_objsize(oid); return SD_RES_SUCCESS; } /* If cleanup is true, temporary objects will be removed */ static int for_each_object_in_path(char *path, int (*func)(uint64_t, char *, uint32_t, void *), bool cleanup, void *arg) { DIR *dir; struct dirent *d; uint64_t oid; int ret = SD_RES_SUCCESS; char p[PATH_MAX]; dir = opendir(path); if (unlikely(!dir)) { sd_err("failed to open %s, %m", path); return SD_RES_EIO; } while ((d = readdir(dir))) { uint32_t epoch = 0; if (unlikely(!strncmp(d->d_name, ".", 1))) continue; oid = strtoull(d->d_name, NULL, 16); if (oid == 0 || oid == ULLONG_MAX) continue; /* don't call callback against temporary objects */ if (strlen(d->d_name) == 20 && strcmp(d->d_name + 16, ".tmp") == 0) { if (cleanup) { snprintf(p, PATH_MAX, "%s/%016"PRIx64".tmp", path, oid); sd_debug("remove tmp object %s", p); unlink(p); } continue; } if (strlen(d->d_name) > 17 && 
d->d_name[16] == '.') epoch = strtoul(d->d_name + 17, NULL, 10); ret = func(oid, path, epoch, arg); if (ret != SD_RES_SUCCESS) break; } closedir(dir); return ret; } static uint64_t get_path_free_size(char *path, uint64_t *used) { struct statvfs fs; uint64_t size; if (statvfs(path, &fs) < 0) { sd_err("get disk %s space failed %m", path); return 0; } size = (int64_t)fs.f_frsize * fs.f_bavail; if (!used) goto out; if (for_each_object_in_path(path, get_total_object_size, false, used) != SD_RES_SUCCESS) return 0; out: return size; } /* * If path is broken during initilization or not support xattr return 0. We can * safely use 0 to represent failure case because 0 space path can be * considered as broken path. */ static uint64_t init_path_space(char *path) { uint64_t size; char stale[PATH_MAX]; if (!is_xattr_enabled(path)) { sd_info("multi-disk support need xattr feature"); goto broken_path; } snprintf(stale, PATH_MAX, "%s/.stale", path); if (xmkdir(stale, sd_def_dmode) < 0) { sd_err("can't mkdir for %s, %m", stale); goto broken_path; } if (getxattr(path, MDNAME, &size, MDSIZE) < 0) { if (errno == ENODATA) { goto create; } else { sd_err("%s, %m", path); goto broken_path; } } return size; create: size = get_path_free_size(path, NULL); if (!size) goto broken_path; if (setxattr(path, MDNAME, &size, MDSIZE, 0) < 0) { sd_err("%s, %m", path); goto broken_path; } return size; broken_path: return 0; } static inline void md_remove_disk(int idx) { int i; sd_info("%s from multi-disk array", md_disks[idx].path); /* * We need to keep last disk path to generate EIO when all disks are * broken */ for (i = idx; i < md_nr_disks - 1; i++) md_disks[i] = md_disks[i + 1]; md_nr_disks--; } uint64_t md_init_space(void) { uint64_t total; int i; reinit: if (!md_nr_disks) return 0; total = 0; for (i = 0; i < md_nr_disks; i++) { md_disks[i].space = init_path_space(md_disks[i].path); if (!md_disks[i].space) { md_remove_disk(i); goto reinit; } total += md_disks[i].space; } calculate_vdisks(md_disks, 
md_nr_disks, total); md_nr_vds = disks_to_vdisks(md_disks, md_nr_disks, md_vds); return total; } char *md_get_object_path(uint64_t oid) { struct vdisk *vd; char *p; sd_read_lock(&md_lock); vd = oid_to_vdisk(oid); p = md_disks[vd->idx].path; sd_unlock(&md_lock); sd_debug("%d, %s", vd->idx, p); return p; } static char *md_get_object_path_nolock(uint64_t oid) { struct vdisk *vd; vd = oid_to_vdisk(oid); return md_disks[vd->idx].path; } int for_each_object_in_wd(int (*func)(uint64_t oid, char *path, uint32_t epoch, void *arg), bool cleanup, void *arg) { int i, ret = SD_RES_SUCCESS; sd_read_lock(&md_lock); for (i = 0; i < md_nr_disks; i++) { ret = for_each_object_in_path(md_disks[i].path, func, cleanup, arg); if (ret != SD_RES_SUCCESS) break; } sd_unlock(&md_lock); return ret; } int for_each_object_in_stale(int (*func)(uint64_t oid, char *path, uint32_t epoch, void *arg), void *arg) { int i, ret = SD_RES_SUCCESS; char path[PATH_MAX]; sd_read_lock(&md_lock); for (i = 0; i < md_nr_disks; i++) { snprintf(path, sizeof(path), "%s/.stale", md_disks[i].path); sd_err("%s", path); ret = for_each_object_in_path(path, func, false, arg); if (ret != SD_RES_SUCCESS) break; } sd_unlock(&md_lock); return ret; } int for_each_obj_path(int (*func)(char *path)) { int i, ret = SD_RES_SUCCESS; sd_read_lock(&md_lock); for (i = 0; i < md_nr_disks; i++) { ret = func(md_disks[i].path); if (ret != SD_RES_SUCCESS) break; } sd_unlock(&md_lock); return ret; } struct md_work { struct work work; char path[PATH_MAX]; }; static inline void kick_recover(void) { struct vnode_info *vinfo = get_vnode_info(); start_recovery(vinfo, vinfo, false); put_vnode_info(vinfo); } static void md_do_recover(struct work *work) { struct md_work *mw = container_of(work, struct md_work, work); int idx, nr = 0; sd_write_lock(&md_lock); idx = path_to_disk_idx(mw->path); if (idx < 0) /* Just ignore the duplicate EIO of the same path */ goto out; md_remove_disk(idx); md_init_space(); nr = md_nr_disks; out: sd_unlock(&md_lock); 
if (nr > 0) kick_recover(); free(mw); } int md_handle_eio(char *fault_path) { struct md_work *mw; if (nr_online_disks() == 0) return SD_RES_EIO; mw = xzalloc(sizeof(*mw)); mw->work.done = md_do_recover; pstrcpy(mw->path, PATH_MAX, fault_path); queue_work(sys->md_wqueue, &mw->work); /* Fool the requester to retry */ return SD_RES_NETWORK_ERROR; } static inline bool md_access(char *path) { if (access(path, R_OK | W_OK) < 0) { if (unlikely(errno != ENOENT)) sd_err("failed to check %s, %m", path); return false; } return true; } static int get_old_new_path(uint64_t oid, uint32_t epoch, char *path, char *old, char *new) { if (!epoch) { snprintf(old, PATH_MAX, "%s/%016" PRIx64, path, oid); snprintf(new, PATH_MAX, "%s/%016" PRIx64, md_get_object_path_nolock(oid), oid); } else { snprintf(old, PATH_MAX, "%s/.stale/%016"PRIx64".%"PRIu32, path, oid, epoch); snprintf(new, PATH_MAX, "%s/.stale/%016"PRIx64".%"PRIu32, md_get_object_path_nolock(oid), oid, epoch); } if (!md_access(old)) return -1; return 0; } static int md_move_object(uint64_t oid, char *old, char *new) { struct strbuf buf = STRBUF_INIT; int fd, ret = -1; size_t sz = get_objsize(oid); fd = open(old, O_RDONLY); if (fd < 0) { sd_err("failed to open %s", old); goto out; } ret = strbuf_read(&buf, fd, sz); if (ret != sz) { sd_err("failed to read %s, %d", old, ret); ret = -1; goto out_close; } if (atomic_create_and_write(new, buf.buf, buf.len, false) < 0) { sd_err("failed to create %s", new); ret = -1; goto out_close; } unlink(old); ret = 0; out_close: close(fd); out: strbuf_release(&buf); return ret; } static int md_check_and_move(uint64_t oid, uint32_t epoch, char *path) { char old[PATH_MAX], new[PATH_MAX]; if (get_old_new_path(oid, epoch, path, old, new) < 0) return SD_RES_EIO; /* * Recovery thread and main thread might try to recover the same object. * Either one succeeds, the other will fail and proceed and end up * trying to move the object to where it is already in place, in this * case we simply return. 
*/ if (!strcmp(old, new)) return SD_RES_SUCCESS; /* We can't use rename(2) accross device */ if (md_move_object(oid, old, new) < 0) { sd_err("move old %s to new %s failed", old, new); return SD_RES_EIO; } sd_debug("from %s to %s", old, new); return SD_RES_SUCCESS; } static int scan_wd(uint64_t oid, uint32_t epoch) { int i, ret = SD_RES_EIO; sd_read_lock(&md_lock); for (i = 0; i < md_nr_disks; i++) { ret = md_check_and_move(oid, epoch, md_disks[i].path); if (ret == SD_RES_SUCCESS) break; } sd_unlock(&md_lock); return ret; } bool md_exist(uint64_t oid) { char path[PATH_MAX]; snprintf(path, PATH_MAX, "%s/%016" PRIx64, md_get_object_path(oid), oid); if (md_access(path)) return true; /* * We have to iterate the WD because we don't have epoch-like history * track to locate the objects for multiple disk failure. Simply do * hard iteration simplify the code a lot. */ if (scan_wd(oid, 0) == SD_RES_SUCCESS) return true; return false; } int md_get_stale_path(uint64_t oid, uint32_t epoch, char *path) { snprintf(path, PATH_MAX, "%s/.stale/%016"PRIx64".%"PRIu32, md_get_object_path(oid), oid, epoch); if (md_access(path)) return SD_RES_SUCCESS; assert(epoch); if (scan_wd(oid, epoch) == SD_RES_SUCCESS) return SD_RES_SUCCESS; return SD_RES_NO_OBJ; } uint32_t md_get_info(struct sd_md_info *info) { uint32_t ret = sizeof(*info); int i; memset(info, 0, ret); sd_read_lock(&md_lock); for (i = 0; i < md_nr_disks; i++) { info->disk[i].idx = i; pstrcpy(info->disk[i].path, PATH_MAX, md_disks[i].path); /* FIXME: better handling failure case. 
*/ info->disk[i].free = get_path_free_size(info->disk[i].path, &info->disk[i].used); } info->nr = md_nr_disks; sd_unlock(&md_lock); return ret; } static inline void md_del_disk(char *path) { int idx = path_to_disk_idx(path); if (idx < 0) { sd_err("invalid path %s", path); return; } md_remove_disk(idx); } static int do_plug_unplug(char *disks, bool plug) { char *path; int old_nr, cur_nr = 0, ret = SD_RES_UNKNOWN; sd_write_lock(&md_lock); old_nr = md_nr_disks; path = strtok(disks, ","); do { if (plug) { if (md_add_disk(path) && purge_directory(path) < 0) md_del_disk(path); } else { md_del_disk(path); } } while ((path = strtok(NULL, ","))); /* If no disks change, bail out */ if (old_nr == md_nr_disks) goto out; md_init_space(); cur_nr = md_nr_disks; ret = SD_RES_SUCCESS; out: sd_unlock(&md_lock); /* * We have to kick recover aggressively because there is possibility * that nr of disks are removed during md_init_space() happens to equal * nr of disks we added. */ if (cur_nr > 0 && ret == SD_RES_SUCCESS) kick_recover(); return ret; } int md_plug_disks(char *disks) { return do_plug_unplug(disks, true); } int md_unplug_disks(char *disks) { return do_plug_unplug(disks, false); } uint64_t md_get_size(uint64_t *used) { uint64_t fsize = 0; *used = 0; sd_read_lock(&md_lock); for (int i = 0; i < md_nr_disks; i++) fsize += get_path_free_size(md_disks[i].path, used); sd_unlock(&md_lock); return fsize + *used; } sheepdog-0.7.5/sheep/migrate.c000066400000000000000000000215321223630776600162400ustar00rootroot00000000000000/* * Copyright (C) 2012 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "sheep_priv.h" /* sheepdog 0.4.0 */ struct node_id_v0 { uint8_t addr[16]; uint16_t port; }; struct sd_node_v0 { struct node_id_v0 nid; uint16_t nr_vnodes; uint32_t zone; }; struct sheepdog_config_v0 { uint64_t ctime; uint16_t flags; uint8_t copies; uint8_t store[STORE_LEN]; }; /* sheepdog 0.5.1 */ struct node_id_v1 { uint8_t addr[16]; uint16_t port; }; struct sd_node_v1 { struct node_id_v1 nid; uint16_t nr_vnodes; uint32_t zone; uint64_t space; }; struct sheepdog_config_v1 { uint64_t ctime; uint16_t flags; uint8_t copies; uint8_t store[STORE_LEN]; uint8_t __pad[3]; uint16_t version; uint64_t space; }; /* sheepdog 0.6.0 */ struct node_id_v2 { uint8_t addr[16]; uint16_t port; uint8_t io_addr[16]; uint16_t io_port; uint8_t pad[4]; }; struct sd_node_v2 { struct node_id_v2 nid; uint16_t nr_vnodes; uint32_t zone; uint64_t space; }; /* sheepdog_config_v2 is the same as v1 */ #define sheepdog_config_v2 sheepdog_config_v1 static size_t get_file_size(const char *path) { struct stat stbuf; int ret; ret = stat(path, &stbuf); if (ret < 0) { sd_err("failed to stat %s, %m", path); return -1; } return stbuf.st_size; } static void for_each_epoch(int (*func)(uint32_t epoch)) { DIR *dir; struct dirent *d; dir = opendir(epoch_path); if (!dir) panic("failed to open %s: %m", epoch_path); while ((d = readdir(dir))) { uint32_t e; char *p; e = strtol(d->d_name, &p, 10); if (d->d_name == p) continue; if (strlen(d->d_name) != 8) continue; if (func(e) != 0) return; } closedir(dir); } /* copy file from 'fname' to 'fname.suffix' */ static int backup_file(char *fname, char *suffix) { char dst_file[PATH_MAX]; int fd = -1, ret = -1, len; void *buf = NULL; snprintf(dst_file, sizeof(dst_file), "%s.%s", fname, suffix); fd = open(fname, O_RDONLY); if (fd < 0) { if (errno != ENOENT) { sd_err("failed to open %s, %m", fname); ret = -1; } else ret = 0; goto out; } len = get_file_size(fname); if (len < 0) goto out; buf = xmalloc(len); ret = xread(fd, buf, len); if (ret != len) { sd_err("failed 
to read %s, %d %m", fname, ret); ret = -1; goto out; } close(fd); fd = open(dst_file, O_CREAT | O_WRONLY | O_DSYNC, 0644); if (fd < 0) { sd_err("failed to create %s, %m", dst_file); ret = -1; goto out; } ret = xwrite(fd, buf, len); if (ret != len) { sd_err("failed to write to %s, %d %m", dst_file, ret); ret = -1; } out: if (fd >= 0) close(fd); free(buf); return ret; } static int backup_epoch(uint32_t epoch) { char path[PATH_MAX]; char suffix[256]; struct timeval tv; struct tm tm; gettimeofday(&tv, NULL); localtime_r(&tv.tv_sec, &tm); strftime(suffix, sizeof(suffix), "%Y-%m-%d_%H%M%S", &tm); snprintf(path, sizeof(path), "%s%08u", epoch_path, epoch); return backup_file(path, suffix); } /* backup config and epoch info */ static int backup_store(void) { char suffix[256]; struct timeval tv; struct tm tm; int ret; gettimeofday(&tv, NULL); localtime_r(&tv.tv_sec, &tm); strftime(suffix, sizeof(suffix), "%Y-%m-%d_%H%M%S", &tm); ret = backup_file(config_path, suffix); if (ret < 0) return ret; for_each_epoch(backup_epoch); return 0; } static int update_epoch_from_v0_to_v1(uint32_t epoch) { char path[PATH_MAX]; struct sd_node_v0 nodes_v0[SD_MAX_NODES]; struct sd_node_v1 nodes_v1[SD_MAX_NODES]; size_t nr_nodes; time_t *t; int len, fd, ret; snprintf(path, sizeof(path), "%s%08u", epoch_path, epoch); fd = open(path, O_RDWR | O_DSYNC); if (fd < 0) { if (errno == ENOENT) return 0; sd_err("failed to open epoch %"PRIu32" log", epoch); return -1; } ret = xread(fd, nodes_v0, sizeof(nodes_v0)); if (ret < 0) { sd_err("failed to read epoch %"PRIu32" log", epoch); close(fd); return ret; } nr_nodes = ret / sizeof(nodes_v0[0]); for (int i = 0; i < nr_nodes; i++) { memcpy(&nodes_v1[i].nid, &nodes_v0[i].nid, sizeof(struct node_id_v1)); nodes_v1[i].nr_vnodes = nodes_v0[i].nr_vnodes; nodes_v1[i].zone = nodes_v0[i].zone; nodes_v1[i].space = 0; } len = sizeof(nodes_v1[0]) * nr_nodes; ret = xpwrite(fd, nodes_v1, len, 0); if (ret != len) { sd_err("failed to write epoch %"PRIu32" log", epoch); 
close(fd); return -1; } t = (time_t *)&nodes_v0[nr_nodes]; ret = xpwrite(fd, t, sizeof(*t), len); if (ret != sizeof(*t)) { sd_err("failed to write time to epoch %" PRIu32 " log", epoch); close(fd); return -1; } close(fd); return 0; } static int migrate_from_v0_to_v1(void) { int ret, fd; struct sheepdog_config_v1 config; fd = open(config_path, O_RDWR); if (fd < 0) { sd_err("failed to open config file, %m"); return -1; } memset(&config, 0, sizeof(config)); ret = xread(fd, &config, sizeof(config)); if (ret < 0) { sd_err("failed to read config file, %m"); close(fd); return ret; } config.version = 1; ret = xpwrite(fd, &config, sizeof(config), 0); if (ret != sizeof(config)) { sd_err("failed to write config data, %m"); close(fd); return -1; } /* 0.5.1 could wrongly extend the config file, so truncate it here */ ret = xftruncate(fd, sizeof(config)); if (ret != 0) { sd_err("failed to truncate config data, %m"); close(fd); return -1; } close(fd); /* * If the config file contains a space field, the store layout * is compatible with v1. In this case, what we need to do is * only adding version number to the config file. */ if (config.space > 0) return 0; /* upgrade epoch log */ for_each_epoch(update_epoch_from_v0_to_v1); return ret; } static int update_epoch_from_v1_to_v2(uint32_t epoch) { char path[PATH_MAX]; struct sd_node_v1 nodes_v1[SD_MAX_NODES]; struct sd_node_v2 nodes_v2[SD_MAX_NODES]; size_t nr_nodes; time_t *t; int len, fd, ret; snprintf(path, sizeof(path), "%s%08u", epoch_path, epoch); fd = open(path, O_RDWR | O_DSYNC); if (fd < 0) { if (errno == ENOENT) return 0; sd_err("failed to open epoch %"PRIu32" log", epoch); return -1; } /* * sheepdog 0.5.6 was released without incrementing the config version. 
* We detect it by 1) checking the size of epoch file, and 2) checking * the value of sd_node.nid.port */ if ((get_file_size(path) - sizeof(time_t)) % sizeof(nodes_v1[0]) != 0) { sd_debug("%s is not a v1 format", path); close(fd); return 0; } ret = xread(fd, nodes_v1, sizeof(nodes_v1)); if (ret < 0) { sd_err("failed to read epoch %"PRIu32" log", epoch); close(fd); return ret; } nr_nodes = ret / sizeof(nodes_v1[0]); for (int i = 0; i < nr_nodes; i++) { if (nodes_v1[i].nid.port == 0) { sd_debug("%s is not a v1 format", path); return 0; } memset(&nodes_v2[i].nid, 0, sizeof(nodes_v2[i].nid)); memcpy(nodes_v2[i].nid.addr, nodes_v1[i].nid.addr, sizeof(nodes_v2[i].nid.addr)); nodes_v2[i].nid.port = nodes_v1[i].nid.port; nodes_v2[i].nr_vnodes = nodes_v1[i].nr_vnodes; nodes_v2[i].zone = nodes_v1[i].zone; nodes_v2[i].space = nodes_v1[i].space; } len = sizeof(nodes_v2[0]) * nr_nodes; ret = xpwrite(fd, nodes_v2, len, 0); if (ret != len) { sd_err("failed to write epoch %"PRIu32" log", epoch); close(fd); return -1; } t = (time_t *)&nodes_v1[nr_nodes]; ret = xpwrite(fd, t, sizeof(*t), len); if (ret != sizeof(*t)) { sd_err("failed to write time to epoch %" PRIu32 " log", epoch); close(fd); return -1; } close(fd); return 0; } static int migrate_from_v1_to_v2(void) { int fd, ret; uint16_t version = 2; char store[STORE_LEN] = "plain"; /* we have only the plain driver */ fd = open(config_path, O_WRONLY | O_DSYNC); if (fd < 0) { sd_err("failed to open config file, %m"); return -1; } ret = xpwrite(fd, &version, sizeof(version), offsetof(struct sheepdog_config_v2, version)); if (ret != sizeof(version)) { sd_err("failed to write config data, %m"); close(fd); return -1; } ret = xpwrite(fd, store, sizeof(store), offsetof(struct sheepdog_config_v2, store)); if (ret != sizeof(store)) { sd_err("failed to write config data, %m"); close(fd); return -1; } close(fd); /* upgrade epoch log */ for_each_epoch(update_epoch_from_v1_to_v2); return ret; } static int (*migrate[])(void) = { 
migrate_from_v0_to_v1, /* from 0.4.0 or 0.5.0 to 0.5.1 */ migrate_from_v1_to_v2, /* from 0.5.x to 0.6.0 */ }; int sd_migrate_store(int from, int to) { int ver, ret; assert(to <= sizeof(migrate)); ret = backup_store(); if (ret != 0) { sd_err("failed to backup the old store"); return ret; } for (ver = from; ver < to; ver++) { ret = migrate[ver](); if (ret < 0) return ret; } /* success */ return 0; } sheepdog-0.7.5/sheep/object_cache.c000066400000000000000000001035621223630776600172050ustar00rootroot00000000000000/* * Copyright (C) 2012 Taobao Inc. * * Liu Yuan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "sheep_priv.h" /* * Object Cache ID * * 0 - 19 (20 bits): data object space * 20 - 27 (8 bits): object flag space * 28 - 31 (4 bits): object type indentifier space */ #define CACHE_VDI_SHIFT 31 /* if the entry is identified as VDI object */ #define CACHE_CREATE_SHIFT 27 /* If the entry should be created at backend */ #define CACHE_VDI_BIT (UINT32_C(1) << CACHE_VDI_SHIFT) #define CACHE_CREATE_BIT (UINT32_C(1) << CACHE_CREATE_SHIFT) #define CACHE_INDEX_MASK (CACHE_CREATE_BIT) #define CACHE_OBJECT_SIZE (SD_DATA_OBJ_SIZE / 1024 / 1024) /* M */ /* Kick background pusher if dirty_count greater than it */ #define MAX_DIRTY_OBJECT_COUNT 10 /* Just a random number, no rationale */ struct global_cache { uint32_t capacity; /* The real capacity of object cache of this node */ uatomic_bool in_reclaim; /* If the relcaimer is working */ }; struct object_cache_entry { uint32_t idx; /* Index of this entry */ refcnt_t refcnt; /* Reference count of this entry */ uint64_t bmap; /* Each bit represents one dirty block in object */ struct object_cache *oc; /* Object cache this entry belongs to */ struct rb_node node; /* For lru 
tree of object cache */ struct list_head dirty_list; /* For dirty list of object cache */ struct list_head lru_list; /* For lru list of object cache */ struct sd_lock lock; /* Entry lock */ }; struct object_cache { uint32_t vid; /* The VID of this VDI */ uint32_t push_count; /* How many push threads queued in push phase. */ uint32_t dirty_count; /* How many dirty object in this cache */ uint32_t total_count; /* Count of objects include dirty and clean */ struct hlist_node hash; /* VDI is linked to the global hash lists */ struct rb_root lru_tree; /* For faster object search */ struct list_head lru_head; /* Per VDI LRU list for reclaimer */ struct list_head dirty_head; /* Dirty objects linked to this list */ int push_efd; /* Used to synchronize between pusher and push threads */ uatomic_bool in_push; /* Whether if pusher is running */ struct sd_lock lock; /* Cache lock */ }; struct push_work { struct work work; struct object_cache_entry *entry; struct object_cache *oc; }; static struct global_cache gcache; static char object_cache_dir[PATH_MAX]; static int def_open_flags = O_RDWR; #define HASH_BITS 5 #define HASH_SIZE (1 << HASH_BITS) static struct sd_lock hashtable_lock[HASH_SIZE] = { [0 ... 
HASH_SIZE - 1] = SD_LOCK_INITIALIZER }; static struct hlist_head cache_hashtable[HASH_SIZE]; static int object_cache_push(struct object_cache *oc); static inline bool entry_is_dirty(const struct object_cache_entry *entry) { return !!entry->bmap; } static inline int hash(uint64_t vid) { return hash_64(vid, HASH_BITS); } /* We should always use this helper to get entry idx */ static inline uint32_t entry_idx(const struct object_cache_entry *entry) { return entry->idx & ~CACHE_INDEX_MASK; } static inline uint32_t object_cache_oid_to_idx(uint64_t oid) { uint32_t idx = data_oid_to_idx(oid); if (is_vdi_obj(oid)) idx |= 1 << CACHE_VDI_SHIFT; return idx; } static inline bool idx_has_vdi_bit(uint32_t idx) { return !!(idx & CACHE_VDI_BIT); } static inline size_t get_cache_block_size(uint64_t oid) { size_t bsize = DIV_ROUND_UP(get_objsize(oid), sizeof(uint64_t) * BITS_PER_BYTE); return round_up(bsize, BLOCK_SIZE); /* To be FS friendly */ } static uint64_t calc_object_bmap(uint64_t oid, size_t len, off_t offset) { int start, end, nr; uint64_t bmap = 0; size_t bsize = get_cache_block_size(oid); start = offset / bsize; end = DIV_ROUND_UP(len + offset, bsize); nr = end - start; while (nr--) set_bit(start + nr, &bmap); return bmap; } static inline void get_cache_entry(struct object_cache_entry *entry) { refcount_inc(&entry->refcnt); } static inline void put_cache_entry(struct object_cache_entry *entry) { refcount_dec(&entry->refcnt); } static inline bool entry_in_use(struct object_cache_entry *entry) { return refcount_read(&entry->refcnt) > 0; } /* * Mutual exclusive protection strategy: * * reader and writer: no need to project since it is okay to read * unacked stale data. * reader, writer and pusher: cache lock and entry lock and refcnt. * reader, writer and reclaimer: cache lock and entry refcnt. * pusher and reclaimer: cache lock and entry refcnt. * * entry->bmap is projected by mostly entry lock, sometimes cache lock. * dirty list is projected by cache lock. 
*/ static inline void read_lock_cache(struct object_cache *oc) { sd_read_lock(&oc->lock); } static inline void write_lock_cache(struct object_cache *oc) { sd_write_lock(&oc->lock); } static inline void unlock_cache(struct object_cache *oc) { sd_unlock(&oc->lock); } static inline void read_lock_entry(struct object_cache_entry *entry) { sd_read_lock(&entry->lock); } static inline void write_lock_entry(struct object_cache_entry *entry) { sd_write_lock(&entry->lock); } static inline void unlock_entry(struct object_cache_entry *entry) { sd_unlock(&entry->lock); } static struct object_cache_entry * lru_tree_insert(struct rb_root *root, struct object_cache_entry *new) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct object_cache_entry *entry; uint32_t idx = entry_idx(new); while (*p) { parent = *p; entry = rb_entry(parent, struct object_cache_entry, node); if (idx < entry_idx(entry)) p = &(*p)->rb_left; else if (idx > entry_idx(entry)) p = &(*p)->rb_right; else { /* already has this entry */ return entry; } } rb_link_node(&new->node, parent, p); rb_insert_color(&new->node, root); return NULL; /* insert successfully */ } static struct object_cache_entry *lru_tree_search(struct rb_root *root, uint32_t idx) { struct rb_node *n = root->rb_node; struct object_cache_entry *t; while (n) { t = rb_entry(n, struct object_cache_entry, node); if (idx < entry_idx(t)) n = n->rb_left; else if (idx > entry_idx(t)) n = n->rb_right; else return t; /* found it */ } return NULL; } static void do_background_push(struct work *work) { struct push_work *pw = container_of(work, struct push_work, work); struct object_cache *oc = pw->oc; if (!uatomic_set_true(&oc->in_push)) return; object_cache_push(oc); uatomic_set_false(&oc->in_push); } static void background_push_done(struct work *work) { struct push_work *pw = container_of(work, struct push_work, work); free(pw); } static void kick_background_pusher(struct object_cache *oc) { struct push_work *pw; pw = 
xzalloc(sizeof(struct push_work)); pw->oc = oc; pw->work.fn = do_background_push; pw->work.done = background_push_done; queue_work(sys->oc_push_wqueue, &pw->work); } static void del_from_dirty_list(struct object_cache_entry *entry) { struct object_cache *oc = entry->oc; list_del_init(&entry->dirty_list); uatomic_dec(&oc->dirty_count); } static void add_to_dirty_list(struct object_cache_entry *entry) { struct object_cache *oc = entry->oc; list_add_tail(&entry->dirty_list, &oc->dirty_head); /* FIXME read sys->status atomically */ if (uatomic_add_return(&oc->dirty_count, 1) > MAX_DIRTY_OBJECT_COUNT && !uatomic_is_true(&oc->in_push) && sys->cinfo.status == SD_STATUS_OK) kick_background_pusher(oc); } static inline void free_cache_entry(struct object_cache_entry *entry) { struct object_cache *oc = entry->oc; rb_erase(&entry->node, &oc->lru_tree); list_del_init(&entry->lru_list); oc->total_count--; if (!list_empty(&entry->dirty_list)) del_from_dirty_list(entry); sd_destroy_lock(&entry->lock); free(entry); } static uint64_t idx_to_oid(uint32_t vid, uint32_t idx) { if (idx_has_vdi_bit(idx)) return vid_to_vdi_oid(vid); else return vid_to_data_oid(vid, idx); } static int remove_cache_object(struct object_cache *oc, uint32_t idx) { int ret = SD_RES_SUCCESS; char path[PATH_MAX]; snprintf(path, sizeof(path), "%s/%06"PRIx32"/%08"PRIx32, object_cache_dir, oc->vid, idx); sd_debug("%"PRIx64, idx_to_oid(oc->vid, idx)); if (unlikely(unlink(path) < 0)) { sd_err("failed to remove cached object %m"); if (errno == ENOENT) return SD_RES_SUCCESS; ret = SD_RES_EIO; goto out; } out: return ret; } static int read_cache_object_noupdate(uint32_t vid, uint32_t idx, void *buf, size_t count, off_t offset) { size_t size; int fd, flags = def_open_flags, ret = SD_RES_SUCCESS; char p[PATH_MAX]; snprintf(p, sizeof(p), "%s/%06"PRIx32"/%08"PRIx32, object_cache_dir, vid, idx); if (sys->object_cache_directio && !idx_has_vdi_bit(idx)) { assert(is_aligned_to_pagesize(buf)); flags |= O_DIRECT; } fd = open(p, 
flags, sd_def_fmode); if (unlikely(fd < 0)) { sd_err("%m"); ret = SD_RES_EIO; goto out; } size = xpread(fd, buf, count, offset); if (unlikely(size != count)) { sd_err("size %zu, count:%zu, offset %jd %m", size, count, (intmax_t)offset); ret = SD_RES_EIO; goto out_close; } out_close: close(fd); out: return ret; } static int write_cache_object_noupdate(uint32_t vid, uint32_t idx, void *buf, size_t count, off_t offset) { size_t size; int fd, flags = def_open_flags, ret = SD_RES_SUCCESS; char p[PATH_MAX]; snprintf(p, sizeof(p), "%s/%06"PRIx32"/%08"PRIx32, object_cache_dir, vid, idx); if (sys->object_cache_directio && !idx_has_vdi_bit(idx)) { assert(is_aligned_to_pagesize(buf)); flags |= O_DIRECT; } fd = open(p, flags, sd_def_fmode); if (unlikely(fd < 0)) { sd_err("%m"); ret = SD_RES_EIO; goto out; } size = xpwrite(fd, buf, count, offset); if (unlikely(size != count)) { sd_err("size %zu, count:%zu, offset %jd %m", size, count, (intmax_t)offset); ret = SD_RES_EIO; goto out_close; } out_close: close(fd); out: return ret; } static int read_cache_object(struct object_cache_entry *entry, void *buf, size_t count, off_t offset) { uint32_t vid = entry->oc->vid, idx = entry_idx(entry); struct object_cache *oc = entry->oc; int ret; ret = read_cache_object_noupdate(vid, idx, buf, count, offset); if (ret == SD_RES_SUCCESS) { write_lock_cache(oc); list_move_tail(&entry->lru_list, &oc->lru_head); unlock_cache(oc); } return ret; } static int write_cache_object(struct object_cache_entry *entry, void *buf, size_t count, off_t offset, bool create, bool writeback) { uint32_t vid = entry->oc->vid, idx = entry_idx(entry); uint64_t oid = idx_to_oid(vid, idx); struct object_cache *oc = entry->oc; struct sd_req hdr; int ret; write_lock_entry(entry); ret = write_cache_object_noupdate(vid, idx, buf, count, offset); if (ret != SD_RES_SUCCESS) { unlock_entry(entry); return ret; } write_lock_cache(oc); if (writeback) { entry->bmap |= calc_object_bmap(oid, count, offset); if 
(list_empty(&entry->dirty_list)) add_to_dirty_list(entry); } list_move_tail(&entry->lru_list, &oc->lru_head); unlock_cache(oc); unlock_entry(entry); if (writeback) goto out; if (create) sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_OBJ); else sd_init_req(&hdr, SD_OP_WRITE_OBJ); hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = count; hdr.obj.oid = oid; hdr.obj.offset = offset; ret = exec_local_req(&hdr, buf); if (ret != SD_RES_SUCCESS) { sd_err("failed to write object %" PRIx64 ", %s", oid, sd_strerror(ret)); return ret; } out: return ret; } static int push_cache_object(uint32_t vid, uint32_t idx, uint64_t bmap, bool create) { struct sd_req hdr; void *buf; off_t offset; uint64_t oid = idx_to_oid(vid, idx); size_t data_length, bsize = get_cache_block_size(oid); int ret = SD_RES_NO_MEM; int first_bit, last_bit; if (!bmap) { sd_debug("WARN: nothing to flush %"PRIx64, oid); return SD_RES_SUCCESS; } first_bit = ffsll(bmap) - 1; last_bit = fls64(bmap) - 1; sd_debug("%"PRIx64" bmap(%zd):0x%"PRIx64", first_bit:%d, last_bit:%d", oid, bsize, bmap, first_bit, last_bit); offset = first_bit * bsize; data_length = min((last_bit - first_bit + 1) * bsize, get_objsize(oid) - offset); buf = xvalloc(data_length); ret = read_cache_object_noupdate(vid, idx, buf, data_length, offset); if (ret != SD_RES_SUCCESS) goto out; if (create) sd_init_req(&hdr, SD_OP_CREATE_AND_WRITE_OBJ); else sd_init_req(&hdr, SD_OP_WRITE_OBJ); hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = data_length; hdr.obj.oid = oid; hdr.obj.offset = offset; ret = exec_local_req(&hdr, buf); if (ret != SD_RES_SUCCESS) sd_err("failed to push object %" PRIx64 ", %s", oid, sd_strerror(ret)); out: free(buf); return ret; } /* * The reclaim algorithm is similar to Linux kernel's page cache: * - only tries to reclaim 'clean' object, which doesn't has any dirty updates, * in a LRU list. * - skip the object when it is in R/W operation. * - skip the dirty object if it is not in push(writeback) phase. 
* - wait on the dirty object if it is in push phase. */ /* * 90% is targeted for a large cache quota such as 200G, then we have 20G * buffer which is large enough to prevent cache overrun. */ #define HIGH_WATERMARK (sys->object_cache_size * 9 / 10) static void do_reclaim_object(struct object_cache *oc) { struct object_cache_entry *entry, *t; uint64_t oid; uint32_t cap; write_lock_cache(oc); list_for_each_entry_safe(entry, t, &oc->lru_head, lru_list) { oid = idx_to_oid(oc->vid, entry_idx(entry)); if (entry_in_use(entry)) { sd_debug("%"PRIx64" is in use, skip...", oid); continue; } /* * The shared snapshot objects won't be released after being * pulled and if sheep restarts, the remaining snapshot objects * will be marked as dirty. So for these kind of objects, we * can reclaim them safely. */ if (entry_is_dirty(entry) && !oid_is_readonly(oid)) { sd_debug("%"PRIx64" is dirty, skip...", oid); continue; } if (remove_cache_object(oc, entry_idx(entry)) != SD_RES_SUCCESS) continue; free_cache_entry(entry); cap = uatomic_sub_return(&gcache.capacity, CACHE_OBJECT_SIZE); sd_debug("%"PRIx64" reclaimed. 
capacity:%"PRId32, oid, cap); if (cap <= HIGH_WATERMARK) break; } unlock_cache(oc); } struct reclaim_work { struct work work; int delay; }; static void do_reclaim(struct work *work) { struct reclaim_work *rw = container_of(work, struct reclaim_work, work); struct object_cache *cache; struct hlist_node *node; int i, j; if (rw->delay) sleep(rw->delay); /* We choose a random victim to avoid reclaim the same one every time */ j = random(); for (i = 0; i < HASH_SIZE; i++) { int idx = (i + j) % HASH_SIZE; struct hlist_head *head = cache_hashtable + idx; sd_read_lock(&hashtable_lock[idx]); hlist_for_each_entry(cache, node, head, hash) { uint32_t cap; do_reclaim_object(cache); cap = uatomic_read(&gcache.capacity); if (cap <= HIGH_WATERMARK) { sd_unlock(&hashtable_lock[idx]); sd_debug("complete, capacity %"PRIu32, cap); return; } } sd_unlock(&hashtable_lock[idx]); } sd_debug("finished"); } static void reclaim_done(struct work *work) { struct reclaim_work *rw = container_of(work, struct reclaim_work, work); uatomic_set_false(&gcache.in_reclaim); free(rw); } static int create_dir_for(uint32_t vid) { int ret = 0; char p[PATH_MAX]; snprintf(p, sizeof(p), "%s/%06"PRIx32, object_cache_dir, vid); if (xmkdir(p, sd_def_dmode) < 0) { sd_err("%s, %m", p); ret = -1; } return ret; } static struct object_cache *find_object_cache(uint32_t vid, bool create) { int h = hash(vid); struct hlist_head *head = cache_hashtable + h; struct object_cache *cache = NULL; struct hlist_node *node; if (create) sd_write_lock(&hashtable_lock[h]); else sd_read_lock(&hashtable_lock[h]); if (hlist_empty(head)) goto not_found; hlist_for_each_entry(cache, node, head, hash) { if (cache->vid == vid) goto out; } not_found: if (create) { cache = xzalloc(sizeof(*cache)); cache->vid = vid; INIT_RB_ROOT(&cache->lru_tree); create_dir_for(vid); cache->push_efd = eventfd(0, 0); INIT_LIST_HEAD(&cache->dirty_head); INIT_LIST_HEAD(&cache->lru_head); sd_init_lock(&cache->lock); hlist_add_head(&cache->hash, head); } else { 
cache = NULL; } out: sd_unlock(&hashtable_lock[h]); return cache; } void object_cache_try_to_reclaim(int delay) { struct reclaim_work *rw; if (!sys->object_cache_size) return; if (uatomic_read(&gcache.capacity) < HIGH_WATERMARK) return; if (!uatomic_set_true(&gcache.in_reclaim)) /* the cache is already in reclaim, */ return; rw = xzalloc(sizeof(struct reclaim_work)); rw->delay = delay; rw->work.fn = do_reclaim; rw->work.done = reclaim_done; queue_work(sys->oc_reclaim_wqueue, &rw->work); } static inline struct object_cache_entry * alloc_cache_entry(struct object_cache *oc, uint32_t idx) { struct object_cache_entry *entry; entry = xzalloc(sizeof(*entry)); entry->oc = oc; entry->idx = idx; sd_init_lock(&entry->lock); INIT_LIST_HEAD(&entry->dirty_list); INIT_LIST_HEAD(&entry->lru_list); return entry; } static void add_to_lru_cache(struct object_cache *oc, uint32_t idx, bool create) { struct object_cache_entry *entry = alloc_cache_entry(oc, idx); sd_debug("oid %"PRIx64" added", idx_to_oid(oc->vid, idx)); write_lock_cache(oc); if (unlikely(lru_tree_insert(&oc->lru_tree, entry))) panic("the object already exist"); uatomic_add(&gcache.capacity, CACHE_OBJECT_SIZE); list_add_tail(&entry->lru_list, &oc->lru_head); oc->total_count++; if (create) { /* Cache lock assure it is not raced with pusher */ entry->bmap = UINT64_MAX; entry->idx |= CACHE_CREATE_BIT; add_to_dirty_list(entry); } unlock_cache(oc); } static inline int lookup_path(char *path) { int ret = SD_RES_SUCCESS; if (access(path, R_OK | W_OK) < 0) { if (unlikely(errno != ENOENT)) { sd_debug("%m"); ret = SD_RES_EIO; } else { ret = SD_RES_NO_CACHE; } } return ret; } static int object_cache_lookup(struct object_cache *oc, uint32_t idx, bool create, bool writeback) { int fd, ret, flags = def_open_flags; char path[PATH_MAX]; snprintf(path, sizeof(path), "%s/%06"PRIx32"/%08"PRIx32, object_cache_dir, oc->vid, idx); if (!create) return lookup_path(path); flags |= O_CREAT | O_TRUNC; fd = open(path, flags, sd_def_fmode); if 
(unlikely(fd < 0)) { sd_debug("%s, %m", path); ret = SD_RES_EIO; goto out; } ret = prealloc(fd, get_objsize(idx_to_oid(oc->vid, idx))); if (unlikely(ret < 0)) { ret = SD_RES_EIO; goto out_close; } add_to_lru_cache(oc, idx, writeback); object_cache_try_to_reclaim(0); out_close: close(fd); out: return ret; } static int create_cache_object(struct object_cache *oc, uint32_t idx, void *buffer, size_t buf_size, off_t offset, size_t obj_size) { int flags = def_open_flags | O_CREAT | O_EXCL, fd; int ret = SD_RES_OID_EXIST; char path[PATH_MAX], tmp_path[PATH_MAX]; snprintf(tmp_path, sizeof(tmp_path), "%s/%06"PRIx32"/%08"PRIx32".tmp", object_cache_dir, oc->vid, idx); fd = open(tmp_path, flags, sd_def_fmode); if (fd < 0) { if (likely(errno == EEXIST)) { sd_debug("%08"PRIx32" already created", idx); goto out; } sd_debug("%m"); ret = SD_RES_EIO; goto out; } /* We need to extend it if the buffer is trimmed */ if (offset != 0 || buf_size != obj_size) { ret = prealloc(fd, obj_size); if (unlikely(ret < 0)) { ret = SD_RES_EIO; sd_err("%m"); goto out_close; } } ret = xpwrite(fd, buffer, buf_size, offset); if (unlikely(ret != buf_size)) { ret = SD_RES_EIO; sd_err("failed, vid %"PRIx32", idx %"PRIx32, oc->vid, idx); goto out_close; } /* This is intended to take care of partial write due to crash */ snprintf(path, sizeof(path), "%s/%06"PRIx32"/%08"PRIx32, object_cache_dir, oc->vid, idx); ret = link(tmp_path, path); if (unlikely(ret < 0)) { if (errno == EEXIST) { ret = SD_RES_OID_EXIST; goto out_close; } sd_debug("failed to link %s to %s: %m", tmp_path, path); /* FIXME: teach object cache handle EIO gracefully */ ret = SD_RES_EIO; goto out_close; } ret = SD_RES_SUCCESS; sd_debug("%08"PRIx32" size %zu", idx, obj_size); out_close: close(fd); unlink(tmp_path); out: return ret; } /* Fetch the object, cache it in the clean state */ static int object_cache_pull(struct object_cache *oc, uint32_t idx) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; int ret = SD_RES_NO_MEM; 
uint64_t oid = idx_to_oid(oc->vid, idx); uint32_t data_length = get_objsize(oid); void *buf; buf = xvalloc(data_length); sd_init_req(&hdr, SD_OP_READ_OBJ); hdr.data_length = data_length; hdr.obj.oid = oid; hdr.obj.offset = 0; ret = exec_local_req(&hdr, buf); if (ret != SD_RES_SUCCESS) goto err; sd_debug("oid %"PRIx64" pulled successfully", oid); ret = create_cache_object(oc, idx, buf, rsp->data_length, rsp->obj.offset, data_length); /* * We try to delay reclaim objects to avoid object ping-pong * because the pulled object is clean and likely to be reclaimed * in a cache over high watermark. We can't simply pass without * waking up reclaimer because the cache is easy to be filled * full with a read storm. */ switch (ret) { case SD_RES_SUCCESS: add_to_lru_cache(oc, idx, false); object_cache_try_to_reclaim(1); break; case SD_RES_OID_EXIST: ret = SD_RES_SUCCESS; break; default: break; } err: free(buf); return ret; } static void do_push_object(struct work *work) { struct push_work *pw = container_of(work, struct push_work, work); struct object_cache_entry *entry = pw->entry; struct object_cache *oc = entry->oc; uint64_t oid = idx_to_oid(oc->vid, entry_idx(entry)); sd_debug("%"PRIx64, oid); read_lock_entry(entry); /* * We might happen to push readonly object in following scenario * 1. sheep pulled some read-only objects * 2. sheep crashed * 3. 
sheep restarted and marked all the objects in cache dirty blindly */ if (oid_is_readonly(idx_to_oid(oc->vid, entry_idx(entry)))) goto clean; if (unlikely(push_cache_object(oc->vid, entry_idx(entry), entry->bmap, !!(entry->idx & CACHE_CREATE_BIT)) != SD_RES_SUCCESS)) panic("push failed but should never fail"); clean: if (uatomic_sub_return(&oc->push_count, 1) == 0) eventfd_xwrite(oc->push_efd, 1); entry->idx &= ~CACHE_CREATE_BIT; entry->bmap = 0; unlock_entry(entry); sd_debug("%"PRIx64" done", oid); put_cache_entry(entry); } static void push_object_done(struct work *work) { struct push_work *pw = container_of(work, struct push_work, work); free(pw); } /* * Push back all the dirty objects before the FLUSH request to sheep replicated * storage synchronously. * * 1. Don't grab cache lock tight so we can serve RW requests while pushing. * It is okay for allow subsequent RW after FLUSH because we only need to * garantee the dirty objects before FLUSH to be pushed. * 2. Use threaded AIO to boost push performance, such as fsync(2) from VM. 
*/ static int object_cache_push(struct object_cache *oc) { struct object_cache_entry *entry, *t; write_lock_cache(oc); if (list_empty(&oc->dirty_head)) { unlock_cache(oc); return SD_RES_SUCCESS; } uatomic_set(&oc->push_count, uatomic_read(&oc->dirty_count)); list_for_each_entry_safe(entry, t, &oc->dirty_head, dirty_list) { struct push_work *pw; get_cache_entry(entry); pw = xzalloc(sizeof(struct push_work)); pw->work.fn = do_push_object; pw->work.done = push_object_done; pw->entry = entry; queue_work(sys->oc_push_wqueue, &pw->work); del_from_dirty_list(entry); } unlock_cache(oc); eventfd_xread(oc->push_efd); sd_debug("%"PRIx32" completed", oc->vid); return SD_RES_SUCCESS; } bool object_is_cached(uint64_t oid) { uint32_t vid = oid_to_vid(oid); uint32_t idx = object_cache_oid_to_idx(oid); struct object_cache *cache; cache = find_object_cache(vid, false); if (!cache) return false; return (object_cache_lookup(cache, idx, 0, false) == SD_RES_SUCCESS); } void object_cache_delete(uint32_t vid) { struct object_cache *cache; int h = hash(vid); struct object_cache_entry *entry, *t; char path[PATH_MAX]; cache = find_object_cache(vid, false); if (!cache) return; /* Firstly we free memeory */ sd_write_lock(&hashtable_lock[h]); hlist_del(&cache->hash); sd_unlock(&hashtable_lock[h]); write_lock_cache(cache); list_for_each_entry_safe(entry, t, &cache->lru_head, lru_list) { free_cache_entry(entry); uatomic_sub(&gcache.capacity, CACHE_OBJECT_SIZE); } unlock_cache(cache); sd_destroy_lock(&cache->lock); close(cache->push_efd); free(cache); /* Then we free disk */ snprintf(path, sizeof(path), "%s/%06"PRIx32, object_cache_dir, vid); rmdir_r(path); } static struct object_cache_entry * get_cache_entry_from(struct object_cache *cache, uint32_t idx) { struct object_cache_entry *entry; read_lock_cache(cache); entry = lru_tree_search(&cache->lru_tree, idx); if (!entry) { /* The cache entry may be reclaimed, so try again. 
*/ unlock_cache(cache); return NULL; } get_cache_entry(entry); unlock_cache(cache); return entry; } /* This helper increases the refcount */ static struct object_cache_entry *oid_to_entry(uint64_t oid) { uint32_t vid = oid_to_vid(oid); uint32_t idx = object_cache_oid_to_idx(oid); struct object_cache *cache; struct object_cache_entry *entry; cache = find_object_cache(vid, false); entry = get_cache_entry_from(cache, idx); if (!entry) { sd_debug("%" PRIx64 " doesn't exist", oid); return NULL; } return entry; } static int object_cache_flush_and_delete(struct object_cache *oc) { DIR *dir; struct dirent *d; uint32_t vid = oc->vid; uint32_t idx; uint64_t all = UINT64_MAX; int ret = 0; char p[PATH_MAX]; sd_debug("%"PRIx32, vid); snprintf(p, sizeof(p), "%s/%06"PRIx32, object_cache_dir, vid); dir = opendir(p); if (!dir) { sd_debug("%m"); ret = -1; goto out; } while ((d = readdir(dir))) { if (!strncmp(d->d_name, ".", 1)) continue; if (strcmp(d->d_name + 8, ".tmp") == 0) { sd_debug("try to del %s", d->d_name); if (unlinkat(dirfd(dir), d->d_name, 0) < 0) sd_err("%m"); continue; } idx = strtoul(d->d_name, NULL, 16); if (idx == ULLONG_MAX) continue; if (push_cache_object(vid, idx, all, true) != SD_RES_SUCCESS) { ret = -1; goto out_close_dir; } } object_cache_delete(vid); out_close_dir: closedir(dir); out: return ret; } bool bypass_object_cache(const struct request *req) { uint64_t oid = req->rq.obj.oid; if (!sys->enable_object_cache || req->local) return true; /* For vmstate && vdi_attr object, we don't do caching */ if (is_vmstate_obj(oid) || is_vdi_attr_obj(oid) || req->rq.flags & SD_FLAG_CMD_COW) return true; if (req->rq.flags & SD_FLAG_CMD_DIRECT) { uint32_t vid = oid_to_vid(oid); struct object_cache *cache; cache = find_object_cache(vid, false); if (!cache) return true; if (req->rq.flags & SD_FLAG_CMD_WRITE) { object_cache_flush_and_delete(cache); return true; } else { /* For read requet, we can read cache if any */ uint32_t idx = object_cache_oid_to_idx(oid); if 
(object_cache_lookup(cache, idx, false, false) == 0) return false; else return true; } } return false; } int object_cache_handle_request(struct request *req) { struct sd_req *hdr = &req->rq; uint64_t oid = req->rq.obj.oid; uint32_t vid = oid_to_vid(oid); uint32_t idx = object_cache_oid_to_idx(oid); struct object_cache *cache; struct object_cache_entry *entry; int ret; bool create = false; sd_debug("%08" PRIx32 ", len %" PRIu32 ", off %" PRIu64, idx, hdr->data_length, hdr->obj.offset); cache = find_object_cache(vid, true); if (req->rq.opcode == SD_OP_CREATE_AND_WRITE_OBJ) create = true; retry: ret = object_cache_lookup(cache, idx, create, hdr->flags & SD_FLAG_CMD_CACHE); switch (ret) { case SD_RES_NO_CACHE: ret = object_cache_pull(cache, idx); if (ret != SD_RES_SUCCESS) return ret; break; case SD_RES_EIO: return ret; } entry = get_cache_entry_from(cache, idx); if (!entry) { sd_debug("retry oid %"PRIx64, oid); /* * For the case that object exists but isn't added to object * list yet, we call pthread_yield() to expect other thread can * add object to list ASAP. 
*/ pthread_yield(); goto retry; } if (hdr->flags & SD_FLAG_CMD_WRITE) { ret = write_cache_object(entry, req->data, hdr->data_length, hdr->obj.offset, create, hdr->flags & SD_FLAG_CMD_CACHE); if (ret != SD_RES_SUCCESS) goto err; } else { ret = read_cache_object(entry, req->data, hdr->data_length, hdr->obj.offset); if (ret != SD_RES_SUCCESS) goto err; req->rp.data_length = hdr->data_length; } err: put_cache_entry(entry); return ret; } int object_cache_write(uint64_t oid, char *data, unsigned int datalen, uint64_t offset, bool create) { struct object_cache_entry *entry = oid_to_entry(oid); int ret; sd_debug("%" PRIx64, oid); if (!entry) { sd_debug("%" PRIx64 " doesn't exist", oid); return SD_RES_NO_CACHE; } ret = write_cache_object(entry, data, datalen, offset, create, false); put_cache_entry(entry); return ret; } int object_cache_read(uint64_t oid, char *data, unsigned int datalen, uint64_t offset) { struct object_cache_entry *entry = oid_to_entry(oid); int ret; sd_debug("%" PRIx64, oid); if (!entry) { sd_debug("%" PRIx64 " doesn't exist", oid); return SD_RES_NO_CACHE; } ret = read_cache_object(entry, data, datalen, offset); put_cache_entry(entry); return ret; } int object_cache_flush_vdi(uint32_t vid) { struct object_cache *cache; int ret; cache = find_object_cache(vid, false); if (!cache) { sd_debug("%"PRIx32" not found", vid); return SD_RES_SUCCESS; } /* * We have to wait for last pusher finishing and push again so * that dirty bits produced while it is waiting are guaranteed * to be pushed back */ while (!uatomic_set_true(&cache->in_push)) usleep(100000); ret = object_cache_push(cache); uatomic_set_false(&cache->in_push); return ret; } int object_cache_flush_and_del(const struct request *req) { uint32_t vid = oid_to_vid(req->rq.obj.oid); struct object_cache *cache; cache = find_object_cache(vid, false); if (cache && object_cache_flush_and_delete(cache) < 0) return SD_RES_EIO; return SD_RES_SUCCESS; } static int load_cache_object(struct object_cache *cache) { DIR 
*dir; struct dirent *d; uint32_t idx; char path[PATH_MAX]; int ret = 0; snprintf(path, sizeof(path), "%s/%06"PRIx32, object_cache_dir, cache->vid); dir = opendir(path); if (!dir) { sd_debug("%m"); ret = -1; goto out; } while ((d = readdir(dir))) { if (!strncmp(d->d_name, ".", 1)) continue; if (strcmp(d->d_name + 8, ".tmp") == 0) { sd_debug("try to del %s", d->d_name); if (unlinkat(dirfd(dir), d->d_name, 0) < 0) sd_err("%m"); continue; } idx = strtoul(d->d_name, NULL, 16); if (idx == ULLONG_MAX) continue; /* * We don't know VM's cache type after restarting, so we assume * that it is writeback and mark all the objects diry to avoid * false reclaim. Donot try to reclaim at loading phase becaue * cluster isn't fully working. */ add_to_lru_cache(cache, idx, true); sd_debug("%"PRIx64, idx_to_oid(cache->vid, idx)); } closedir(dir); out: return ret; } static int load_cache(void) { DIR *dir; struct dirent *d; uint32_t vid; char path[PATH_MAX]; int ret = 0; snprintf(path, sizeof(path), "%s", object_cache_dir); dir = opendir(path); if (!dir) { sd_debug("%m"); ret = -1; goto out; } while ((d = readdir(dir))) { if (!strncmp(d->d_name, ".", 1)) continue; vid = strtoul(d->d_name, NULL, 16); if (vid == ULLONG_MAX) continue; load_cache_object(find_object_cache(vid, true)); } closedir(dir); out: return ret; } int object_cache_remove(uint64_t oid) { /* Inc the entry refcount to exclude the reclaimer */ struct object_cache_entry *entry = oid_to_entry(oid); struct object_cache *oc = entry->oc; int ret; if (!entry) return SD_RES_NO_OBJ; sd_debug("%" PRIx64, oid); while (refcount_read(&entry->refcnt) > 1) usleep(100000); /* Object might be in push */ write_lock_cache(oc); /* * We assume no other thread will inc the refcount of this entry * before we call write_lock_cache(). object_cache_remove() is called * in the DISCARD context, which means nornamly no other read/write * requests. 
*/ assert(refcount_read(&entry->refcnt) == 1); ret = remove_cache_object(oc, entry_idx(entry)); if (ret != SD_RES_SUCCESS) { unlock_cache(oc); return ret; } free_cache_entry(entry); unlock_cache(oc); uatomic_sub(&gcache.capacity, CACHE_OBJECT_SIZE); return SD_RES_SUCCESS; } int object_cache_init(const char *p) { int ret = 0; struct strbuf buf = STRBUF_INIT; strbuf_addstr(&buf, p); if (xmkdir(buf.buf, sd_def_dmode) < 0) { sd_err("%s %m", buf.buf); ret = -1; goto err; } strbuf_addstr(&buf, "/cache"); if (xmkdir(buf.buf, sd_def_dmode) < 0) { sd_err("%s %m", buf.buf); ret = -1; goto err; } strbuf_copyout(&buf, object_cache_dir, sizeof(object_cache_dir)); uatomic_set(&gcache.capacity, 0); uatomic_set_false(&gcache.in_reclaim); ret = load_cache(); err: strbuf_release(&buf); return ret; } void object_cache_format(void) { struct object_cache *cache; struct hlist_node *node, *t; int i; for (i = 0; i < HASH_SIZE; i++) { struct hlist_head *head = cache_hashtable + i; hlist_for_each_entry_safe(cache, node, t, head, hash) { object_cache_delete(cache->vid); } } uatomic_set(&gcache.capacity, 0); } int object_cache_get_info(struct object_cache_info *info) { int j = 0; info->used = gcache.capacity * 1024 * 1024; info->size = sys->object_cache_size * 1024 * 1024; for (int i = 0; i < HASH_SIZE; i++) { struct hlist_head *head = cache_hashtable + i; struct object_cache *cache; struct hlist_node *node; sd_read_lock(&hashtable_lock[i]); hlist_for_each_entry(cache, node, head, hash) { read_lock_cache(cache); info->caches[j].vid = cache->vid; info->caches[j].dirty = cache->dirty_count; info->caches[j].total = cache->total_count; j++; unlock_cache(cache); } sd_unlock(&hashtable_lock[i]); } info->count = j; return sizeof(*info); } sheepdog-0.7.5/sheep/object_list_cache.c000066400000000000000000000134471223630776600202420ustar00rootroot00000000000000/* * Copyright (C) 2012 Taobao Inc. 
* * Levin Li * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "sheep_priv.h" struct objlist_cache_entry { uint64_t oid; struct list_head list; struct rb_node node; }; struct objlist_cache { int tree_version; int buf_version; int cache_size; uint64_t *buf; struct list_head entry_list; struct rb_root root; struct sd_lock lock; }; struct objlist_deletion_work { uint32_t vid; struct work work; }; static struct objlist_cache obj_list_cache = { .tree_version = 1, .root = RB_ROOT, .entry_list = LIST_HEAD_INIT(obj_list_cache.entry_list), .lock = SD_LOCK_INITIALIZER, }; static struct objlist_cache_entry *objlist_cache_rb_insert(struct rb_root *root, struct objlist_cache_entry *new) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct objlist_cache_entry *entry; while (*p) { parent = *p; entry = rb_entry(parent, struct objlist_cache_entry, node); if (new->oid < entry->oid) p = &(*p)->rb_left; else if (new->oid > entry->oid) p = &(*p)->rb_right; else return entry; /* already has this entry */ } rb_link_node(&new->node, parent, p); rb_insert_color(&new->node, root); return NULL; /* insert successfully */ } static int objlist_cache_rb_remove(struct rb_root *root, uint64_t oid) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct objlist_cache_entry *entry; while (*p) { parent = *p; entry = rb_entry(parent, struct objlist_cache_entry, node); if (oid < entry->oid) p = &(*p)->rb_left; else if (oid > entry->oid) p = &(*p)->rb_right; else { list_del(&entry->list); rb_erase(parent, root); free(entry); return 0; } } return -1; /* fail to remove */ } void objlist_cache_remove(uint64_t oid) { sd_write_lock(&obj_list_cache.lock); if (!objlist_cache_rb_remove(&obj_list_cache.root, oid)) 
{ obj_list_cache.cache_size--; obj_list_cache.tree_version++; } sd_unlock(&obj_list_cache.lock); } int objlist_cache_insert(uint64_t oid) { struct objlist_cache_entry *entry, *p; entry = xzalloc(sizeof(*entry)); entry->oid = oid; rb_init_node(&entry->node); sd_write_lock(&obj_list_cache.lock); p = objlist_cache_rb_insert(&obj_list_cache.root, entry); if (p) free(entry); else { list_add(&entry->list, &obj_list_cache.entry_list); obj_list_cache.cache_size++; obj_list_cache.tree_version++; } sd_unlock(&obj_list_cache.lock); return 0; } int get_obj_list(const struct sd_req *hdr, struct sd_rsp *rsp, void *data) { int nr = 0; struct objlist_cache_entry *entry; /* first try getting the cached buffer with only a read lock held */ sd_read_lock(&obj_list_cache.lock); if (obj_list_cache.tree_version == obj_list_cache.buf_version) goto out; /* if that fails grab a write lock for the usually nessecary update */ sd_unlock(&obj_list_cache.lock); sd_write_lock(&obj_list_cache.lock); if (obj_list_cache.tree_version == obj_list_cache.buf_version) goto out; obj_list_cache.buf_version = obj_list_cache.tree_version; obj_list_cache.buf = xrealloc(obj_list_cache.buf, obj_list_cache.cache_size * sizeof(uint64_t)); list_for_each_entry(entry, &obj_list_cache.entry_list, list) { obj_list_cache.buf[nr++] = entry->oid; } out: if (hdr->data_length < obj_list_cache.cache_size * sizeof(uint64_t)) { sd_unlock(&obj_list_cache.lock); sd_err("GET_OBJ_LIST buffer too small"); return SD_RES_BUFFER_SMALL; } rsp->data_length = obj_list_cache.cache_size * sizeof(uint64_t); memcpy(data, obj_list_cache.buf, rsp->data_length); sd_unlock(&obj_list_cache.lock); return SD_RES_SUCCESS; } static void objlist_deletion_work(struct work *work) { struct objlist_deletion_work *ow = container_of(work, struct objlist_deletion_work, work); struct objlist_cache_entry *entry, *t; uint32_t vid = ow->vid, entry_vid; /* * Before reclaiming the cache belonging to the VDI just deleted, * we should test whether the VDI is exist, 
because after some node * deleting it and before the notification is sent to all the node, * another node may issus a VDI creation event and reused the VDI id * again, in which case we should not reclaim the cached entry. */ if (vdi_exist(vid)) { sd_debug("VDI (%" PRIx32 ") is still in use, can not be" " deleted", vid); return; } sd_write_lock(&obj_list_cache.lock); list_for_each_entry_safe(entry, t, &obj_list_cache.entry_list, list) { entry_vid = oid_to_vid(entry->oid); if (entry_vid != vid) continue; /* VDI objects cannot be removed even after we delete images. */ if (is_vdi_obj(entry->oid)) continue; sd_debug("delete object entry %" PRIx64, entry->oid); list_del(&entry->list); rb_erase(&entry->node, &obj_list_cache.root); free(entry); } sd_unlock(&obj_list_cache.lock); } static void objlist_deletion_done(struct work *work) { struct objlist_deletion_work *ow = container_of(work, struct objlist_deletion_work, work); free(ow); } /* * During recovery, some objects may be migrated from one node to a * new one, but we can't remove the object list cache entry in this * case, it may causes recovery failure, so after recovery, we can * not locate the cache entry correctly, causing objlist_cache_remove() * fail to delete it, then we need this function to do the cleanup work * in all nodes. */ int objlist_cache_cleanup(uint32_t vid) { struct objlist_deletion_work *ow; ow = xzalloc(sizeof(*ow)); ow->vid = vid; ow->work.fn = objlist_deletion_work; ow->work.done = objlist_deletion_done; queue_work(sys->deletion_wqueue, &ow->work); return SD_RES_SUCCESS; } sheepdog-0.7.5/sheep/ops.c000066400000000000000000000747221223630776600154220ustar00rootroot00000000000000/* * Copyright (C) 2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. 
* * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include "sheep_priv.h" #include "trace/trace.h" enum sd_op_type { SD_OP_TYPE_CLUSTER = 1, /* cluster operations */ SD_OP_TYPE_LOCAL, /* local operations */ SD_OP_TYPE_PEER, /* io operations */ SD_OP_TYPE_GATEWAY, /* gateway operations */ }; struct sd_op_template { const char *name; enum sd_op_type type; /* process request even when cluster is not working */ bool force; /* * process_work() will be called in a worker thread, and process_main() * will be called in the main thread. * * If type is SD_OP_TYPE_CLUSTER, it is guaranteed that only one node * processes a cluster operation at the same time. We can use this for * for example to implement distributed locking. process_work() * will be called on the local node, and process_main() will be called * on every node. * * If type is SD_OP_TYPE_LOCAL, both process_work() and process_main() * will be called on the local node. * * If type is SD_OP_TYPE_PEER, only process_work() will be called, and it * will be called on the local node. */ int (*process_work)(struct request *req); int (*process_main)(const struct sd_req *req, struct sd_rsp *rsp, void *data); }; static int stat_sheep(uint64_t *store_size, uint64_t *store_free, uint32_t epoch) { uint64_t used; if (sys->gateway_only) { *store_size = 0; *store_free = 0; } else { *store_size = md_get_size(&used); *store_free = *store_size - used; } return SD_RES_SUCCESS; } static int cluster_new_vdi(struct request *req) { const struct sd_req *hdr = &req->rq; struct sd_rsp *rsp = &req->rp; uint32_t vid; int ret; struct vdi_iocb iocb = { .name = req->data, .data_len = hdr->data_length, .size = hdr->vdi.vdi_size, .base_vid = hdr->vdi.base_vdi_id, .create_snapshot = !!hdr->vdi.snapid, .nr_copies = hdr->vdi.copies ? 
hdr->vdi.copies : sys->cinfo.nr_copies, }; if (hdr->data_length != SD_MAX_VDI_LEN) return SD_RES_INVALID_PARMS; ret = vdi_create(&iocb, &vid); rsp->vdi.vdi_id = vid; rsp->vdi.copies = iocb.nr_copies; return ret; } static int post_cluster_new_vdi(const struct sd_req *req, struct sd_rsp *rsp, void *data) { unsigned long nr = rsp->vdi.vdi_id; int ret = rsp->result; sd_debug("done %d %lx", ret, nr); if (ret == SD_RES_SUCCESS) atomic_set_bit(nr, sys->vdi_inuse); return ret; } static int vdi_init_tag(const char **tag, const char *buf, uint32_t len) { if (len == SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN) *tag = buf + SD_MAX_VDI_LEN; else if (len == SD_MAX_VDI_LEN) *tag = NULL; else return -1; return 0; } static int cluster_del_vdi(struct request *req) { const struct sd_req *hdr = &req->rq; uint32_t data_len = hdr->data_length; struct vdi_iocb iocb = { .name = req->data, .data_len = data_len, .snapid = hdr->vdi.snapid, }; if (vdi_init_tag(&iocb.tag, req->data, data_len) < 0) return SD_RES_INVALID_PARMS; return vdi_delete(&iocb, req); } struct cache_deletion_work { uint32_t vid; struct work work; }; static void cache_delete_work(struct work *work) { struct cache_deletion_work *dw = container_of(work, struct cache_deletion_work, work); object_cache_delete(dw->vid); } static void cache_delete_done(struct work *work) { struct cache_deletion_work *dw = container_of(work, struct cache_deletion_work, work); free(dw); } static int post_cluster_del_vdi(const struct sd_req *req, struct sd_rsp *rsp, void *data) { unsigned long vid = rsp->vdi.vdi_id; struct cache_deletion_work *dw; int ret = rsp->result; if (!sys->enable_object_cache) return ret; dw = xzalloc(sizeof(*dw)); dw->vid = vid; dw->work.fn = cache_delete_work; dw->work.done = cache_delete_done; queue_work(sys->deletion_wqueue, &dw->work); return ret; } /* * Look up vid and copy number from vdi name * * This must be a cluster operation. If QEMU reads the vdi object * while sheep snapshots the vdi, sheep can return SD_RES_NO_VDI. 
To * avoid this problem, SD_OP_GET_INFO must be ordered with * SD_OP_NEW_VDI. */ static int cluster_get_vdi_info(struct request *req) { const struct sd_req *hdr = &req->rq; struct sd_rsp *rsp = &req->rp; uint32_t data_len = hdr->data_length; int ret; struct vdi_info info = {}; struct vdi_iocb iocb = { .name = req->data, .data_len = data_len, .snapid = hdr->vdi.snapid, }; if (vdi_init_tag(&iocb.tag, req->data, data_len) < 0) return SD_RES_INVALID_PARMS; ret = vdi_lookup(&iocb, &info); if (ret != SD_RES_SUCCESS) return ret; rsp->vdi.vdi_id = info.vid; rsp->vdi.copies = get_vdi_copy_number(info.vid); return ret; } static int remove_epoch(uint32_t epoch) { int ret; char path[PATH_MAX]; sd_debug("remove epoch %"PRIu32, epoch); snprintf(path, sizeof(path), "%s%08u", epoch_path, epoch); ret = unlink(path); if (ret && ret != -ENOENT) { sd_err("failed to remove %s: %s", path, strerror(-ret)); return SD_RES_EIO; } return SD_RES_EIO; } static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp, void *data) { int i, ret; uint32_t latest_epoch; struct store_driver *driver; char *store_name = data; driver = find_store_driver(data); if (!driver) return SD_RES_NO_STORE; pstrcpy((char *)sys->cinfo.store, sizeof(sys->cinfo.store), store_name); sd_store = driver; latest_epoch = get_latest_epoch(); ret = sd_store->format(); if (ret != SD_RES_SUCCESS) return ret; ret = sd_store->init(); if (ret != SD_RES_SUCCESS) return ret; sys->cinfo.nr_copies = req->cluster.copies; sys->cinfo.flags = req->flags; if (!sys->cinfo.nr_copies) sys->cinfo.nr_copies = SD_DEFAULT_COPIES; sys->cinfo.ctime = req->cluster.ctime; set_cluster_config(&sys->cinfo); for (i = 1; i <= latest_epoch; i++) remove_epoch(i); memset(sys->vdi_inuse, 0, sizeof(sys->vdi_inuse)); clean_vdi_state(); sys->cinfo.epoch = 0; ret = inc_and_log_epoch(); if (ret) return SD_RES_EIO; sys->cinfo.status = SD_STATUS_OK; return SD_RES_SUCCESS; } static int cluster_shutdown(const struct sd_req *req, struct sd_rsp *rsp, void 
*data) { sys->cinfo.status = SD_STATUS_SHUTDOWN; return SD_RES_SUCCESS; } static int cluster_enable_recover(const struct sd_req *req, struct sd_rsp *rsp, void *data) { sys->cinfo.disable_recovery = false; resume_suspended_recovery(); return SD_RES_SUCCESS; } static int cluster_disable_recover(const struct sd_req *req, struct sd_rsp *rsp, void *data) { sys->cinfo.disable_recovery = true; return SD_RES_SUCCESS; } static int cluster_get_vdi_attr(struct request *req) { const struct sd_req *hdr = &req->rq; struct sd_rsp *rsp = &req->rp; uint32_t vid, attrid = 0; struct sheepdog_vdi_attr *vattr; struct vdi_iocb iocb = {}; struct vdi_info info = {}; int ret; vattr = req->data; iocb.name = vattr->name; iocb.tag = vattr->tag; iocb.snapid = hdr->vdi.snapid; ret = vdi_lookup(&iocb, &info); if (ret != SD_RES_SUCCESS) return ret; /* * the current VDI id can change if we take a snapshot, * so we use the hash value of the VDI name as the VDI id */ vid = fnv_64a_buf(vattr->name, strlen(vattr->name), FNV1A_64_INIT); vid &= SD_NR_VDIS - 1; ret = get_vdi_attr(req->data, hdr->data_length, vid, &attrid, info.create_time, !!(hdr->flags & SD_FLAG_CMD_CREAT), !!(hdr->flags & SD_FLAG_CMD_EXCL), !!(hdr->flags & SD_FLAG_CMD_DEL)); rsp->vdi.vdi_id = vid; rsp->vdi.attr_id = attrid; rsp->vdi.copies = get_vdi_copy_number(vid); return ret; } static int local_release_vdi(struct request *req) { uint32_t vid = req->rq.vdi.base_vdi_id; int ret; if (!sys->enable_object_cache) return SD_RES_SUCCESS; if (!vid) { sd_info("Some VDI failed to release the object cache. 
" "Probably you are running old QEMU."); return SD_RES_SUCCESS; } ret = object_cache_flush_vdi(vid); if (ret == SD_RES_SUCCESS) object_cache_delete(vid); return ret; } static int local_get_store_list(struct request *req) { struct strbuf buf = STRBUF_INIT; struct store_driver *driver; list_for_each_entry(driver, &store_drivers, list) { strbuf_addf(&buf, "%s ", driver->name); } req->rp.data_length = strbuf_copyout(&buf, req->data, req->data_length); strbuf_release(&buf); return SD_RES_SUCCESS; } static int local_read_vdis(const struct sd_req *req, struct sd_rsp *rsp, void *data) { return read_vdis(data, req->data_length, &rsp->data_length); } static int local_get_vdi_copies(const struct sd_req *req, struct sd_rsp *rsp, void *data) { rsp->data_length = fill_vdi_state_list(data); return SD_RES_SUCCESS; } static int local_stat_sheep(struct request *req) { struct sd_rsp *rsp = &req->rp; uint32_t epoch = req->rq.epoch; return stat_sheep(&rsp->node.store_size, &rsp->node.store_free, epoch); } static int local_stat_recovery(const struct sd_req *req, struct sd_rsp *rsp, void *data) { get_recovery_state(data); rsp->data_length = sizeof(struct recovery_state); return SD_RES_SUCCESS; } static int local_stat_cluster(struct request *req) { struct sd_rsp *rsp = &req->rp; struct epoch_log *elog; int i, max_elogs; uint32_t epoch; if (req->vinfo == NULL) { sd_debug("cluster is not started up"); goto out; } max_elogs = req->rq.data_length / sizeof(*elog); epoch = get_latest_epoch(); for (i = 0; i < max_elogs; i++) { size_t nr_nodes; if (epoch <= 0) break; elog = (struct epoch_log *)req->data + i; memset(elog, 0, sizeof(*elog)); elog->epoch = epoch; elog->ctime = sys->cinfo.ctime; nr_nodes = epoch_log_read_with_timestamp(epoch, elog->nodes, sizeof(elog->nodes), (time_t *)&elog->time); if (nr_nodes == -1) nr_nodes = epoch_log_read_remote(epoch, elog->nodes, sizeof(elog->nodes), (time_t *)&elog->time, req->vinfo); assert(nr_nodes >= 0); assert(nr_nodes <= SD_MAX_NODES); elog->nr_nodes = 
nr_nodes; elog->disable_recovery = sys->cinfo.disable_recovery; rsp->data_length += sizeof(*elog); epoch--; } out: switch (sys->cinfo.status) { case SD_STATUS_OK: return SD_RES_SUCCESS; case SD_STATUS_WAIT: if (sys->cinfo.ctime == 0) return SD_RES_WAIT_FOR_FORMAT; else return SD_RES_WAIT_FOR_JOIN; case SD_STATUS_SHUTDOWN: return SD_RES_SHUTDOWN; default: return SD_RES_SYSTEM_ERROR; } } static int local_get_obj_list(struct request *req) { return get_obj_list(&req->rq, &req->rp, req->data); } static int local_get_epoch(struct request *req) { uint32_t epoch = req->rq.obj.tgt_epoch; int nr_nodes, nodes_len; time_t timestamp; sd_debug("%d", epoch); nr_nodes = epoch_log_read_with_timestamp(epoch, req->data, req->rq.data_length - sizeof(timestamp), ×tamp); if (nr_nodes == -1) return SD_RES_NO_TAG; nodes_len = nr_nodes * sizeof(struct sd_node); memcpy((void *)((char *)req->data + nodes_len), ×tamp, sizeof(timestamp)); req->rp.data_length = nodes_len + sizeof(time_t); return SD_RES_SUCCESS; } static int cluster_force_recover_work(struct request *req) { struct vnode_info *old_vnode_info; uint32_t epoch = sys_epoch(); /* * We should manually recover the cluster when * 1) the master is physically down (different epoch condition). * 2) some nodes are physically down (same epoch condition). * In both case, the nodes(s) stat is WAIT_FOR_JOIN. 
*/ if (sys->cinfo.status != SD_STATUS_WAIT || req->vinfo == NULL) return SD_RES_FORCE_RECOVER; old_vnode_info = get_vnode_info_epoch(epoch, req->vinfo); if (!old_vnode_info) { sd_emerg("cannot get vnode info for epoch %d", epoch); put_vnode_info(old_vnode_info); return SD_RES_FORCE_RECOVER; } if (req->rq.data_length < sizeof(*old_vnode_info->nodes) * old_vnode_info->nr_nodes) { sd_err("too small buffer size, %d", req->rq.data_length); return SD_RES_INVALID_PARMS; } req->rp.epoch = epoch; req->rp.data_length = sizeof(*old_vnode_info->nodes) * old_vnode_info->nr_nodes; memcpy(req->data, old_vnode_info->nodes, req->rp.data_length); put_vnode_info(old_vnode_info); return SD_RES_SUCCESS; } static int cluster_force_recover_main(const struct sd_req *req, struct sd_rsp *rsp, void *data) { struct vnode_info *old_vnode_info, *vnode_info; int ret = SD_RES_SUCCESS; struct sd_node *nodes = data; size_t nr_nodes = rsp->data_length / sizeof(*nodes); if (rsp->epoch != sys->cinfo.epoch) { sd_err("epoch was incremented while cluster_force_recover"); return SD_RES_FORCE_RECOVER; } ret = inc_and_log_epoch(); if (ret) { sd_emerg("cannot update epoch log"); goto err; } if (!is_cluster_formatted()) /* initialize config file */ set_cluster_config(&sys->cinfo); sys->cinfo.status = SD_STATUS_OK; vnode_info = get_vnode_info(); old_vnode_info = alloc_vnode_info(nodes, nr_nodes); start_recovery(vnode_info, old_vnode_info, true); put_vnode_info(vnode_info); put_vnode_info(old_vnode_info); return ret; err: panic("failed in force recovery"); } static int cluster_cleanup(const struct sd_req *req, struct sd_rsp *rsp, void *data) { int ret; if (node_in_recovery()) return SD_RES_NODE_IN_RECOVERY; if (sys->gateway_only) return SD_RES_SUCCESS; if (sd_store->cleanup) ret = sd_store->cleanup(); else ret = SD_RES_NO_SUPPORT; return ret; } static int cluster_notify_vdi_add(const struct sd_req *req, struct sd_rsp *rsp, void *data) { if (req->vdi_state.old_vid) /* make the previous working vdi a snapshot */ 
add_vdi_state(req->vdi_state.old_vid, get_vdi_copy_number(req->vdi_state.old_vid), true); if (req->vdi_state.set_bitmap) atomic_set_bit(req->vdi_state.new_vid, sys->vdi_inuse); add_vdi_state(req->vdi_state.new_vid, req->vdi_state.copies, false); return SD_RES_SUCCESS; } static int cluster_notify_vdi_del(const struct sd_req *req, struct sd_rsp *rsp, void *data) { uint32_t vid = *(uint32_t *)data; return objlist_cache_cleanup(vid); } static int cluster_delete_cache(const struct sd_req *req, struct sd_rsp *rsp, void *data) { uint32_t vid = oid_to_vid(req->obj.oid); if (sys->enable_object_cache) object_cache_delete(vid); return SD_RES_SUCCESS; } static int cluster_recovery_completion(const struct sd_req *req, struct sd_rsp *rsp, void *data) { static struct sd_node recovereds[SD_MAX_NODES], *node; static size_t nr_recovereds; static int latest_epoch; struct vnode_info *vnode_info; int i; uint32_t epoch = req->obj.tgt_epoch; node = (struct sd_node *)data; if (latest_epoch > epoch) return SD_RES_SUCCESS; if (latest_epoch < epoch) { sd_debug("new epoch %d", epoch); latest_epoch = epoch; nr_recovereds = 0; } recovereds[nr_recovereds++] = *node; xqsort(recovereds, nr_recovereds, node_cmp); sd_debug("%s is recovered at epoch %d", node_to_str(node), epoch); for (i = 0; i < nr_recovereds; i++) sd_debug("[%x] %s", i, node_to_str(recovereds + i)); if (sys->cinfo.epoch != latest_epoch) return SD_RES_SUCCESS; vnode_info = get_vnode_info(); if (vnode_info->nr_nodes == nr_recovereds) { for (i = 0; i < nr_recovereds; ++i) { if (!node_eq(vnode_info->nodes + i, recovereds + i)) break; } if (i == nr_recovereds) { sd_debug("all nodes are recovered, epoch %d", epoch); /* sd_store can be NULL if this node is a gateway */ if (sd_store && sd_store->cleanup) sd_store->cleanup(); } } put_vnode_info(vnode_info); return SD_RES_SUCCESS; } static bool node_size_varied(void) { uint64_t new, used, old = sys->this_node.space; double diff; if (sys->gateway_only) return false; new = md_get_size(&used); 
/* If !old, it is forced-out-gateway. Not supported by current node */ if (!old) { if (new) return true; else return false; } diff = new > old ? (double)(new - old) : (double)(old - new); sd_debug("new %"PRIu64 ", old %"PRIu64", ratio %f", new, old, diff / (double)old); if (diff / (double)old < 0.01) return false; sys->this_node.space = new; set_node_space(new); return true; } static int cluster_reweight(const struct sd_req *req, struct sd_rsp *rsp, void *data) { if (node_size_varied()) return sys->cdrv->update_node(&sys->this_node); return SD_RES_SUCCESS; } static int local_md_info(struct request *request) { struct sd_rsp *rsp = &request->rp; assert(request->rq.data_length == sizeof(struct sd_md_info)); rsp->data_length = md_get_info((struct sd_md_info *)request->data); return rsp->data_length ? SD_RES_SUCCESS : SD_RES_UNKNOWN; } static int local_md_plug(const struct sd_req *req, struct sd_rsp *rsp, void *data) { char *disks = (char *)data; return md_plug_disks(disks); } static int local_md_unplug(const struct sd_req *req, struct sd_rsp *rsp, void *data) { char *disks = (char *)data; return md_unplug_disks(disks); } static int local_get_hash(struct request *request) { struct sd_req *req = &request->rq; struct sd_rsp *rsp = &request->rp; if (!sd_store->get_hash) return SD_RES_NO_SUPPORT; return sd_store->get_hash(req->obj.oid, req->obj.tgt_epoch, rsp->hash.digest); } static int local_get_cache_info(struct request *request) { struct sd_rsp *rsp = &request->rp; assert(request->rq.data_length == sizeof(struct object_cache_info)); rsp->data_length = object_cache_get_info((struct object_cache_info *) request->data); return SD_RES_SUCCESS; } /* Return SD_RES_INVALID_PARMS to ask client not to send flush req again */ static int local_flush_vdi(struct request *req) { int ret = SD_RES_INVALID_PARMS; if (sys->enable_object_cache) { uint32_t vid = oid_to_vid(req->rq.obj.oid); ret = object_cache_flush_vdi(vid); } return ret; } static int local_discard_obj(struct request *req) 
{ uint64_t oid = req->rq.obj.oid; uint32_t vid = oid_to_vid(oid), zero = 0; int ret, idx = data_oid_to_idx(oid); sd_debug("%"PRIx64, oid); ret = write_object(vid_to_vdi_oid(vid), (char *)&zero, sizeof(zero), SD_INODE_HEADER_SIZE + sizeof(vid) * idx, false); if (ret != SD_RES_SUCCESS) return ret; if (remove_object(oid) != SD_RES_SUCCESS) sd_err("failed to remove %"PRIx64, oid); /* * Return success even if remove_object fails because we have updated * inode successfully. */ return SD_RES_SUCCESS; } static int local_flush_and_del(struct request *req) { if (!sys->enable_object_cache) return SD_RES_SUCCESS; return object_cache_flush_and_del(req); } static int local_trace_enable(const struct sd_req *req, struct sd_rsp *rsp, void *data) { return trace_enable(data); } static int local_trace_disable(const struct sd_req *req, struct sd_rsp *rsp, void *data) { return trace_disable(data); } static int local_trace_status(const struct sd_req *req, struct sd_rsp *rsp, void *data) { rsp->data_length = trace_status(data); return SD_RES_SUCCESS; } static int local_trace_read_buf(struct request *request) { struct sd_req *req = &request->rq; struct sd_rsp *rsp = &request->rp; int ret; ret = trace_buffer_pop(request->data, req->data_length); if (ret == -1) return SD_RES_AGAIN; rsp->data_length = ret; sd_debug("%u", rsp->data_length); return SD_RES_SUCCESS; } static int local_kill_node(const struct sd_req *req, struct sd_rsp *rsp, void *data) { sys->cinfo.status = SD_STATUS_KILLED; return SD_RES_SUCCESS; } static int read_copy_from_replica(struct request *req, uint32_t epoch, uint64_t oid, char *buf) { struct request read_req = { }; struct sd_req *hdr = &read_req.rq; struct sd_rsp *rsp = &read_req.rp; int ret; /* Create a fake gateway read request */ sd_init_req(hdr, SD_OP_READ_OBJ); hdr->data_length = SD_DATA_OBJ_SIZE; hdr->epoch = epoch; hdr->obj.oid = oid; hdr->obj.offset = 0; hdr->obj.copies = get_req_copy_number(req); read_req.data = buf; read_req.op = get_sd_op(hdr->opcode); 
read_req.vinfo = req->vinfo; ret = gateway_read_obj(&read_req); if (ret == SD_RES_SUCCESS) untrim_zero_blocks(buf, rsp->obj.offset, rsp->data_length, SD_DATA_OBJ_SIZE); return ret; } int peer_remove_obj(struct request *req) { uint64_t oid = req->rq.obj.oid; objlist_cache_remove(oid); return sd_store->remove_object(oid); } int peer_read_obj(struct request *req) { struct sd_req *hdr = &req->rq; struct sd_rsp *rsp = &req->rp; int ret; uint32_t epoch = hdr->epoch; struct siocb iocb; if (sys->gateway_only) return SD_RES_NO_OBJ; memset(&iocb, 0, sizeof(iocb)); iocb.epoch = epoch; iocb.buf = req->data; iocb.length = hdr->data_length; iocb.offset = hdr->obj.offset; ret = sd_store->read(hdr->obj.oid, &iocb); if (ret != SD_RES_SUCCESS) goto out; rsp->data_length = hdr->data_length; rsp->obj.offset = 0; trim_zero_blocks(req->data, &rsp->obj.offset, &rsp->data_length); if (hdr->obj.copies) rsp->obj.copies = hdr->obj.copies; else rsp->obj.copies = get_obj_copy_number(hdr->obj.oid, req->vinfo->nr_zones); out: return ret; } static int do_create_and_write_obj(struct siocb *iocb, struct sd_req *hdr, uint32_t epoch, void *data) { iocb->buf = data; iocb->length = hdr->data_length; iocb->offset = hdr->obj.offset; return sd_store->create_and_write(hdr->obj.oid, iocb); } int peer_write_obj(struct request *req) { struct sd_req *hdr = &req->rq; struct siocb iocb = { }; uint64_t oid = hdr->obj.oid; iocb.epoch = hdr->epoch; iocb.buf = req->data; iocb.length = hdr->data_length; iocb.offset = hdr->obj.offset; return sd_store->write(oid, &iocb); } int peer_create_and_write_obj(struct request *req) { struct sd_req *hdr = &req->rq; struct sd_req cow_hdr; uint32_t epoch = hdr->epoch; uint64_t oid = hdr->obj.oid; char *buf = NULL; struct siocb iocb; int ret = SD_RES_SUCCESS; memset(&iocb, 0, sizeof(iocb)); iocb.epoch = epoch; iocb.length = get_objsize(oid); if (hdr->flags & SD_FLAG_CMD_COW) { sd_debug("%" PRIx64 ", %" PRIx64, oid, hdr->obj.cow_oid); buf = xvalloc(SD_DATA_OBJ_SIZE); if 
(hdr->data_length != SD_DATA_OBJ_SIZE) { ret = read_copy_from_replica(req, hdr->epoch, hdr->obj.cow_oid, buf); if (ret != SD_RES_SUCCESS) { sd_err("failed to read cow object"); goto out; } } memcpy(buf + hdr->obj.offset, req->data, hdr->data_length); memcpy(&cow_hdr, hdr, sizeof(cow_hdr)); cow_hdr.data_length = SD_DATA_OBJ_SIZE; cow_hdr.obj.offset = 0; trim_zero_blocks(buf, &cow_hdr.obj.offset, &cow_hdr.data_length); ret = do_create_and_write_obj(&iocb, &cow_hdr, epoch, buf); } else ret = do_create_and_write_obj(&iocb, hdr, epoch, req->data); if (SD_RES_SUCCESS == ret) objlist_cache_insert(oid); out: if (buf) free(buf); return ret; } static struct sd_op_template sd_ops[] = { /* cluster operations */ [SD_OP_NEW_VDI] = { .name = "NEW_VDI", .type = SD_OP_TYPE_CLUSTER, .process_work = cluster_new_vdi, .process_main = post_cluster_new_vdi, }, [SD_OP_DEL_VDI] = { .name = "DEL_VDI", .type = SD_OP_TYPE_CLUSTER, .process_work = cluster_del_vdi, .process_main = post_cluster_del_vdi, }, [SD_OP_MAKE_FS] = { .name = "MAKE_FS", .type = SD_OP_TYPE_CLUSTER, .force = true, .process_main = cluster_make_fs, }, [SD_OP_SHUTDOWN] = { .name = "SHUTDOWN", .type = SD_OP_TYPE_CLUSTER, .force = true, .process_main = cluster_shutdown, }, [SD_OP_GET_VDI_ATTR] = { .name = "GET_VDI_ATTR", .type = SD_OP_TYPE_CLUSTER, .process_work = cluster_get_vdi_attr, }, [SD_OP_FORCE_RECOVER] = { .name = "FORCE_RECOVER", .type = SD_OP_TYPE_CLUSTER, .force = true, .process_work = cluster_force_recover_work, .process_main = cluster_force_recover_main, }, [SD_OP_CLEANUP] = { .name = "CLEANUP", .type = SD_OP_TYPE_CLUSTER, .force = true, .process_main = cluster_cleanup, }, [SD_OP_NOTIFY_VDI_DEL] = { .name = "NOTIFY_VDI_DEL", .type = SD_OP_TYPE_CLUSTER, .force = true, .process_main = cluster_notify_vdi_del, }, [SD_OP_NOTIFY_VDI_ADD] = { .name = "NOTIFY_VDI_ADD", .type = SD_OP_TYPE_CLUSTER, .force = true, .process_main = cluster_notify_vdi_add, }, [SD_OP_DELETE_CACHE] = { .name = "DELETE_CACHE", .type = 
SD_OP_TYPE_CLUSTER, .process_main = cluster_delete_cache, }, [SD_OP_COMPLETE_RECOVERY] = { .name = "COMPLETE_RECOVERY", .type = SD_OP_TYPE_CLUSTER, .force = true, .process_main = cluster_recovery_completion, }, [SD_OP_GET_VDI_INFO] = { .name = "GET_VDI_INFO", .type = SD_OP_TYPE_CLUSTER, .process_work = cluster_get_vdi_info, }, [SD_OP_LOCK_VDI] = { .name = "LOCK_VDI", .type = SD_OP_TYPE_CLUSTER, .process_work = cluster_get_vdi_info, }, [SD_OP_REWEIGHT] = { .name = "REWEIGHT", .type = SD_OP_TYPE_CLUSTER, .process_main = cluster_reweight, }, [SD_OP_ENABLE_RECOVER] = { .name = "ENABLE_RECOVER", .type = SD_OP_TYPE_CLUSTER, .process_main = cluster_enable_recover, }, [SD_OP_DISABLE_RECOVER] = { .name = "DISABLE_RECOVER", .type = SD_OP_TYPE_CLUSTER, .process_main = cluster_disable_recover, }, /* local operations */ [SD_OP_RELEASE_VDI] = { .name = "RELEASE_VDI", .type = SD_OP_TYPE_LOCAL, .process_work = local_release_vdi, }, [SD_OP_GET_STORE_LIST] = { .name = "GET_STORE_LIST", .type = SD_OP_TYPE_LOCAL, .force = true, .process_work = local_get_store_list, }, [SD_OP_READ_VDIS] = { .name = "READ_VDIS", .type = SD_OP_TYPE_LOCAL, .force = true, .process_main = local_read_vdis, }, [SD_OP_GET_VDI_COPIES] = { .name = "GET_VDI_COPIES", .type = SD_OP_TYPE_LOCAL, .force = true, .process_main = local_get_vdi_copies, }, [SD_OP_GET_NODE_LIST] = { .name = "GET_NODE_LIST", .type = SD_OP_TYPE_LOCAL, .force = true, .process_main = local_get_node_list, }, [SD_OP_STAT_SHEEP] = { .name = "STAT_SHEEP", .type = SD_OP_TYPE_LOCAL, .process_work = local_stat_sheep, }, [SD_OP_STAT_RECOVERY] = { .name = "STAT_RECOVERY", .type = SD_OP_TYPE_LOCAL, .process_main = local_stat_recovery, }, [SD_OP_STAT_CLUSTER] = { .name = "STAT_CLUSTER", .type = SD_OP_TYPE_LOCAL, .force = true, .process_work = local_stat_cluster, }, [SD_OP_GET_OBJ_LIST] = { .name = "GET_OBJ_LIST", .type = SD_OP_TYPE_LOCAL, .process_work = local_get_obj_list, }, [SD_OP_GET_EPOCH] = { .name = "GET_EPOCH", .type = SD_OP_TYPE_LOCAL, 
.process_work = local_get_epoch, }, [SD_OP_FLUSH_VDI] = { .name = "FLUSH_VDI", .type = SD_OP_TYPE_LOCAL, .process_work = local_flush_vdi, }, [SD_OP_DISCARD_OBJ] = { .name = "DISCARD_OBJ", .type = SD_OP_TYPE_LOCAL, .process_work = local_discard_obj, }, [SD_OP_FLUSH_DEL_CACHE] = { .name = "DEL_CACHE", .type = SD_OP_TYPE_LOCAL, .process_work = local_flush_and_del, }, [SD_OP_TRACE_ENABLE] = { .name = "TRACE_ENABLE", .type = SD_OP_TYPE_LOCAL, .force = true, .process_main = local_trace_enable, }, [SD_OP_TRACE_DISABLE] = { .name = "TRACE_DISABLE", .type = SD_OP_TYPE_LOCAL, .force = true, .process_main = local_trace_disable, }, [SD_OP_TRACE_STATUS] = { .name = "TRACE_STATUS", .type = SD_OP_TYPE_LOCAL, .force = true, .process_main = local_trace_status, }, [SD_OP_TRACE_READ_BUF] = { .name = "TRACE_READ_BUF", .type = SD_OP_TYPE_LOCAL, .force = true, .process_work = local_trace_read_buf, }, [SD_OP_KILL_NODE] = { .name = "KILL_NODE", .type = SD_OP_TYPE_LOCAL, .force = true, .process_main = local_kill_node, }, [SD_OP_MD_INFO] = { .name = "MD_INFO", .type = SD_OP_TYPE_LOCAL, .process_work = local_md_info, }, [SD_OP_MD_PLUG] = { .name = "MD_PLUG_DISKS", .type = SD_OP_TYPE_LOCAL, .process_main = local_md_plug, }, [SD_OP_MD_UNPLUG] = { .name = "MD_UNPLUG_DISKS", .type = SD_OP_TYPE_LOCAL, .process_main = local_md_unplug, }, [SD_OP_GET_HASH] = { .name = "GET_HASH", .type = SD_OP_TYPE_LOCAL, .process_work = local_get_hash, }, [SD_OP_GET_CACHE_INFO] = { .name = "GET_CACHE_INFO", .type = SD_OP_TYPE_LOCAL, .process_work = local_get_cache_info, }, /* gateway I/O operations */ [SD_OP_CREATE_AND_WRITE_OBJ] = { .name = "CREATE_AND_WRITE_OBJ", .type = SD_OP_TYPE_GATEWAY, .process_work = gateway_create_and_write_obj, }, [SD_OP_READ_OBJ] = { .name = "READ_OBJ", .type = SD_OP_TYPE_GATEWAY, .process_work = gateway_read_obj, }, [SD_OP_WRITE_OBJ] = { .name = "WRITE_OBJ", .type = SD_OP_TYPE_GATEWAY, .process_work = gateway_write_obj, }, [SD_OP_REMOVE_OBJ] = { .name = "REMOVE_OBJ", .type = 
SD_OP_TYPE_GATEWAY, .process_work = gateway_remove_obj, }, /* peer I/O operations */ [SD_OP_CREATE_AND_WRITE_PEER] = { .name = "CREATE_AND_WRITE_PEER", .type = SD_OP_TYPE_PEER, .process_work = peer_create_and_write_obj, }, [SD_OP_READ_PEER] = { .name = "READ_PEER", .type = SD_OP_TYPE_PEER, .process_work = peer_read_obj, }, [SD_OP_WRITE_PEER] = { .name = "WRITE_PEER", .type = SD_OP_TYPE_PEER, .process_work = peer_write_obj, }, [SD_OP_REMOVE_PEER] = { .name = "REMOVE_PEER", .type = SD_OP_TYPE_PEER, .process_work = peer_remove_obj, }, }; const struct sd_op_template *get_sd_op(uint8_t opcode) { if (sd_ops[opcode].type == 0) return NULL; return sd_ops + opcode; } const char *op_name(const struct sd_op_template *op) { return op->name; } bool is_cluster_op(const struct sd_op_template *op) { return op->type == SD_OP_TYPE_CLUSTER; } bool is_local_op(const struct sd_op_template *op) { return op->type == SD_OP_TYPE_LOCAL; } bool is_peer_op(const struct sd_op_template *op) { return op->type == SD_OP_TYPE_PEER; } bool is_gateway_op(const struct sd_op_template *op) { return op->type == SD_OP_TYPE_GATEWAY; } bool is_force_op(const struct sd_op_template *op) { return !!op->force; } bool has_process_work(const struct sd_op_template *op) { return !!op->process_work; } bool has_process_main(const struct sd_op_template *op) { return !!op->process_main; } void do_process_work(struct work *work) { struct request *req = container_of(work, struct request, work); int ret = SD_RES_SUCCESS; sd_debug("%x, %" PRIx64", %"PRIu32, req->rq.opcode, req->rq.obj.oid, req->rq.epoch); if (req->op->process_work) ret = req->op->process_work(req); if (ret != SD_RES_SUCCESS) { sd_debug("failed: %x, %" PRIx64" , %u, %s", req->rq.opcode, req->rq.obj.oid, req->rq.epoch, sd_strerror(ret)); } req->rp.result = ret; } int do_process_main(const struct sd_op_template *op, const struct sd_req *req, struct sd_rsp *rsp, void *data) { return op->process_main(req, rsp, data); } int sheep_do_op_work(const struct 
sd_op_template *op, struct request *req) { return op->process_work(req); } static int map_table[] = { [SD_OP_CREATE_AND_WRITE_OBJ] = SD_OP_CREATE_AND_WRITE_PEER, [SD_OP_READ_OBJ] = SD_OP_READ_PEER, [SD_OP_WRITE_OBJ] = SD_OP_WRITE_PEER, [SD_OP_REMOVE_OBJ] = SD_OP_REMOVE_PEER, }; int gateway_to_peer_opcode(int opcode) { assert(opcode < ARRAY_SIZE(map_table)); return map_table[opcode]; } sheepdog-0.7.5/sheep/plain_store.c000066400000000000000000000316271223630776600171350ustar00rootroot00000000000000/* * Copyright (C) 2012 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include "sheep_priv.h" #define sector_algined(x) ({ ((x) & (SECTOR_SIZE - 1)) == 0; }) static inline bool iocb_is_aligned(const struct siocb *iocb) { return sector_algined(iocb->offset) && sector_algined(iocb->length); } static int prepare_iocb(uint64_t oid, const struct siocb *iocb, bool create) { int flags = O_DSYNC | O_RDWR; if (uatomic_is_true(&sys->use_journal) || sys->nosync == true) flags &= ~O_DSYNC; if (sys->backend_dio && iocb_is_aligned(iocb)) { assert(is_aligned_to_pagesize(iocb->buf)); flags |= O_DIRECT; } if (create) flags |= O_CREAT | O_EXCL; return flags; } static int get_obj_path(uint64_t oid, char *path) { return snprintf(path, PATH_MAX, "%s/%016" PRIx64, md_get_object_path(oid), oid); } static int get_tmp_obj_path(uint64_t oid, char *path) { return snprintf(path, PATH_MAX, "%s/%016"PRIx64".tmp", md_get_object_path(oid), oid); } static int get_stale_obj_path(uint64_t oid, uint32_t epoch, char *path) { return md_get_stale_path(oid, epoch, path); } bool default_exist(uint64_t oid) { return md_exist(oid); } static int err_to_sderr(char *path, uint64_t oid, int err) { struct stat s; char 
*dir = dirname(path); sd_debug("%s", dir); switch (err) { case ENOENT: if (stat(dir, &s) < 0) { sd_err("%s corrupted", dir); return md_handle_eio(dir); } sd_debug("object %016" PRIx64 " not found locally", oid); return SD_RES_NO_OBJ; case ENOSPC: /* TODO: stop automatic recovery */ sd_err("diskfull, oid=%"PRIx64, oid); return SD_RES_NO_SPACE; case EMFILE: case ENFILE: case EINTR: case EAGAIN: case EEXIST: sd_err("%m, oid=%"PRIx64, oid); /* make gateway try again */ return SD_RES_NETWORK_ERROR; default: sd_err("oid=%"PRIx64", %m", oid); return md_handle_eio(dir); } } int default_write(uint64_t oid, const struct siocb *iocb) { int flags = prepare_iocb(oid, iocb, false), fd, ret = SD_RES_SUCCESS; char path[PATH_MAX]; ssize_t size; if (iocb->epoch < sys_epoch()) { sd_debug("%"PRIu32" sys %"PRIu32, iocb->epoch, sys_epoch()); return SD_RES_OLD_NODE_VER; } if (uatomic_is_true(&sys->use_journal) && unlikely(journal_write_store(oid, iocb->buf, iocb->length, iocb->offset, false)) != SD_RES_SUCCESS) { sd_err("turn off journaling"); uatomic_set_false(&sys->use_journal); flags |= O_DSYNC; sync(); } get_obj_path(oid, path); fd = open(path, flags, sd_def_fmode); if (unlikely(fd < 0)) return err_to_sderr(path, oid, errno); size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset); if (unlikely(size != iocb->length)) { sd_err("failed to write object %"PRIx64", path=%s, offset=%" PRId64", size=%"PRId32", result=%zd, %m", oid, path, iocb->offset, iocb->length, size); ret = err_to_sderr(path, oid, errno); goto out; } out: close(fd); return ret; } static int make_stale_dir(char *path) { char p[PATH_MAX]; snprintf(p, PATH_MAX, "%s/.stale", path); if (xmkdir(p, sd_def_dmode) < 0) { sd_err("%s failed, %m", p); return SD_RES_EIO; } return SD_RES_SUCCESS; } static int purge_dir(char *path) { if (purge_directory(path) < 0) return SD_RES_EIO; return SD_RES_SUCCESS; } static int purge_stale_dir(char *path) { char p[PATH_MAX]; snprintf(p, PATH_MAX, "%s/.stale", path); return purge_dir(p); } int 
default_cleanup(void) { int ret; ret = for_each_obj_path(purge_stale_dir); if (ret != SD_RES_SUCCESS) return ret; return SD_RES_SUCCESS; } static int init_vdi_state(uint64_t oid, char *wd, uint32_t epoch) { int ret; struct sd_inode *inode = xzalloc(SD_INODE_HEADER_SIZE); struct siocb iocb = { .epoch = epoch, .buf = inode, .length = SD_INODE_HEADER_SIZE, }; ret = default_read(oid, &iocb); if (ret != SD_RES_SUCCESS) { sd_err("failed to read inode header %" PRIx64 " %" PRId32, oid, epoch); goto out; } add_vdi_state(oid_to_vid(oid), inode->nr_copies, vdi_is_snapshot(inode)); atomic_set_bit(oid_to_vid(oid), sys->vdi_inuse); ret = SD_RES_SUCCESS; out: free(inode); /* propagate the read failure instead of returning SD_RES_SUCCESS unconditionally; the dead store "ret = SD_RES_SUCCESS" above shows ret was always meant to be returned */ return ret; } static int init_objlist_and_vdi_bitmap(uint64_t oid, char *wd, uint32_t epoch, void *arg) { int ret; objlist_cache_insert(oid); if (is_vdi_obj(oid)) { sd_debug("found the VDI object %" PRIx64, oid); ret = init_vdi_state(oid, wd, epoch); if (ret != SD_RES_SUCCESS) return ret; } return SD_RES_SUCCESS; } int default_init(void) { int ret; sd_debug("use plain store driver"); ret = for_each_obj_path(make_stale_dir); if (ret != SD_RES_SUCCESS) return ret; for_each_object_in_stale(init_objlist_and_vdi_bitmap, NULL); return for_each_object_in_wd(init_objlist_and_vdi_bitmap, true, NULL); } static int default_read_from_path(uint64_t oid, char *path, const struct siocb *iocb) { int flags = prepare_iocb(oid, iocb, false), fd, ret = SD_RES_SUCCESS; ssize_t size; fd = open(path, flags); if (fd < 0) return err_to_sderr(path, oid, errno); size = xpread(fd, iocb->buf, iocb->length, iocb->offset); if (unlikely(size != iocb->length)) { sd_err("failed to read object %"PRIx64", path=%s, offset=%" PRId64", size=%"PRId32", result=%zd, %m", oid, path, iocb->offset, iocb->length, size); ret = err_to_sderr(path, oid, errno); } close(fd); return ret; } int default_read(uint64_t oid, const struct siocb *iocb) { int ret; char path[PATH_MAX]; get_obj_path(oid, path); ret = default_read_from_path(oid, path, iocb); /* 
* If the request is againt the older epoch, try to read from * the stale directory */ if (ret == SD_RES_NO_OBJ && iocb->epoch > 0 && iocb->epoch < sys_epoch()) { get_stale_obj_path(oid, iocb->epoch, path); ret = default_read_from_path(oid, path, iocb); } return ret; } /* Preallocate the whole object to get a better filesystem layout. */ int prealloc(int fd, uint32_t size) { int ret = xfallocate(fd, 0, 0, size); if (ret < 0) { if (errno != ENOSYS && errno != EOPNOTSUPP) { sd_err("failed to preallocate space, %m"); return ret; } return xftruncate(fd, size); } return 0; } int default_create_and_write(uint64_t oid, const struct siocb *iocb) { char path[PATH_MAX], tmp_path[PATH_MAX]; int flags = prepare_iocb(oid, iocb, true); int ret, fd; uint32_t len = iocb->length; get_obj_path(oid, path); get_tmp_obj_path(oid, tmp_path); if (uatomic_is_true(&sys->use_journal) && journal_write_store(oid, iocb->buf, iocb->length, iocb->offset, true) != SD_RES_SUCCESS) { sd_err("turn off journaling"); uatomic_set_false(&sys->use_journal); flags |= O_DSYNC; sync(); } fd = open(tmp_path, flags, sd_def_fmode); if (fd < 0) { if (errno == EEXIST) { /* * This happens if node membership changes during object * creation; while gateway retries a CREATE request, * recovery process could also recover the object at the * same time. They should try to write the same date, * so it is okay to simply return success here. */ sd_debug("%s exists", tmp_path); return SD_RES_SUCCESS; } sd_err("failed to open %s: %m", tmp_path); return err_to_sderr(path, oid, errno); } if (iocb->offset != 0 || iocb->length != get_objsize(oid)) { ret = prealloc(fd, get_objsize(oid)); if (ret < 0) { ret = err_to_sderr(path, oid, errno); goto out; } } ret = xpwrite(fd, iocb->buf, len, iocb->offset); if (ret != len) { sd_err("failed to write object. 
%m"); ret = err_to_sderr(path, oid, errno); goto out; } ret = rename(tmp_path, path); if (ret < 0) { sd_err("failed to rename %s to %s: %m", tmp_path, path); ret = err_to_sderr(path, oid, errno); goto out; } sd_debug("%"PRIx64, oid); ret = SD_RES_SUCCESS; out: if (ret != SD_RES_SUCCESS) unlink(tmp_path); close(fd); return ret; } int default_link(uint64_t oid, uint32_t tgt_epoch) { char path[PATH_MAX], stale_path[PATH_MAX]; sd_debug("try link %"PRIx64" from snapshot with epoch %d", oid, tgt_epoch); get_obj_path(oid, path); get_stale_obj_path(oid, tgt_epoch, stale_path); if (link(stale_path, path) < 0) { /* * Recovery thread and main thread might try to recover the * same object and we might get EEXIST in such case. */ if (errno == EEXIST) goto out; sd_debug("failed to link from %s to %s, %m", stale_path, path); return err_to_sderr(path, oid, errno); } out: return SD_RES_SUCCESS; } static bool oid_stale(uint64_t oid) { int i, nr_copies; struct vnode_info *vinfo; const struct sd_vnode *v; bool ret = true; const struct sd_vnode *obj_vnodes[SD_MAX_COPIES]; vinfo = get_vnode_info(); nr_copies = get_obj_copy_number(oid, vinfo->nr_zones); if (!nr_copies) { ret = false; goto out; } oid_to_vnodes(vinfo->vnodes, vinfo->nr_vnodes, oid, nr_copies, obj_vnodes); for (i = 0; i < nr_copies; i++) { v = obj_vnodes[i]; if (vnode_is_local(v)) { ret = false; break; } } out: put_vnode_info(vinfo); return ret; } static int move_object_to_stale_dir(uint64_t oid, char *wd, uint32_t epoch, void *arg) { char path[PATH_MAX], stale_path[PATH_MAX]; uint32_t tgt_epoch = *(int *)arg; snprintf(path, PATH_MAX, "%s/%016" PRIx64, wd, oid); snprintf(stale_path, PATH_MAX, "%s/.stale/%016"PRIx64".%"PRIu32, wd, oid, tgt_epoch); /* fix: unlikely() must wrap the whole comparison; unlikely(rename(...)) evaluates to 0/1 via __builtin_expect(!!(x), 0) and is never < 0, so rename failures were silently ignored */ if (unlikely(rename(path, stale_path) < 0)) { /* report the destination the move failed to reach, not the source path */ sd_err("failed to move stale object %" PRIX64 " to %s, %m", oid, stale_path); return SD_RES_EIO; } sd_debug("moved object %"PRIx64, oid); return SD_RES_SUCCESS; } static int check_stale_objects(uint64_t oid, char *wd, uint32_t epoch, void 
*arg) { if (oid_stale(oid)) return move_object_to_stale_dir(oid, wd, 0, arg); return SD_RES_SUCCESS; } int default_update_epoch(uint32_t epoch) { assert(epoch); return for_each_object_in_wd(check_stale_objects, false, &epoch); } int default_format(void) { unsigned ret; sd_debug("try get a clean store"); ret = for_each_obj_path(purge_dir); if (ret != SD_RES_SUCCESS) return ret; if (sys->enable_object_cache) object_cache_format(); return SD_RES_SUCCESS; } int default_remove_object(uint64_t oid) { char path[PATH_MAX]; if (uatomic_is_true(&sys->use_journal)) journal_remove_object(oid); get_obj_path(oid, path); if (unlink(path) < 0) { if (errno == ENOENT) return SD_RES_NO_OBJ; sd_err("failed to remove object %"PRIx64", %m", oid); return SD_RES_EIO; } return SD_RES_SUCCESS; } #define SHA1NAME "user.obj.sha1" static int get_object_sha1(char *path, uint8_t *sha1) { if (getxattr(path, SHA1NAME, sha1, SHA1_DIGEST_SIZE) != SHA1_DIGEST_SIZE) { if (errno == ENODATA) sd_debug("sha1 is not cached yet, %s", path); else sd_err("fail to get xattr, %s", path); return -1; } return 0; } static int set_object_sha1(char *path, const uint8_t *sha1) { int ret; ret = setxattr(path, SHA1NAME, sha1, SHA1_DIGEST_SIZE, 0); if (ret < 0) sd_err("fail to set sha1, %s", path); return ret; } static int get_object_path(uint64_t oid, uint32_t epoch, char *path) { if (default_exist(oid)) { get_obj_path(oid, path); } else { get_stale_obj_path(oid, epoch, path); if (access(path, F_OK) < 0) { if (errno == ENOENT) return SD_RES_NO_OBJ; return SD_RES_EIO; } } return SD_RES_SUCCESS; } int default_get_hash(uint64_t oid, uint32_t epoch, uint8_t *sha1) { int ret; void *buf; struct siocb iocb = {}; uint32_t length; bool is_readonly_obj = oid_is_readonly(oid); char path[PATH_MAX]; ret = get_object_path(oid, epoch, path); if (ret != SD_RES_SUCCESS) return ret; if (is_readonly_obj) { if (get_object_sha1(path, sha1) == 0) { sd_debug("use cached sha1 digest %s", sha1_to_hex(sha1)); return SD_RES_SUCCESS; } } length = 
get_objsize(oid); buf = valloc(length); if (buf == NULL) return SD_RES_NO_MEM; iocb.epoch = epoch; iocb.buf = buf; iocb.length = length; ret = default_read_from_path(oid, path, &iocb); if (ret != SD_RES_SUCCESS) { free(buf); return ret; } sha1_from_buffer(buf, length, sha1); free(buf); sd_debug("the message digest of %"PRIx64" at epoch %d is %s", oid, epoch, sha1_to_hex(sha1)); if (is_readonly_obj) set_object_sha1(path, sha1); return ret; } int default_purge_obj(void) { uint32_t tgt_epoch = get_latest_epoch(); return for_each_object_in_wd(move_object_to_stale_dir, true, &tgt_epoch); } static struct store_driver plain_store = { .name = "plain", .init = default_init, .exist = default_exist, .create_and_write = default_create_and_write, .write = default_write, .read = default_read, .link = default_link, .update_epoch = default_update_epoch, .cleanup = default_cleanup, .format = default_format, .remove_object = default_remove_object, .get_hash = default_get_hash, .purge_obj = default_purge_obj, }; add_store_driver(plain_store); sheepdog-0.7.5/sheep/recovery.c000066400000000000000000000525531223630776600164550ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include "sheep_priv.h" /* base structure for the recovery thread */ struct recovery_work { uint32_t epoch; uint32_t tgt_epoch; struct vnode_info *old_vinfo; struct vnode_info *cur_vinfo; struct work work; }; /* for preparing lists */ struct recovery_list_work { struct recovery_work base; uint64_t count; uint64_t *oids; }; /* for recoverying objects */ struct recovery_obj_work { struct recovery_work base; uint64_t oid; /* the object to be recovered */ bool stop; /* local replica in the stale directory */ uint32_t local_epoch; uint8_t local_sha1[SHA1_DIGEST_SIZE]; }; /* * recovery information * * We cannot access the members of this structure outside of the main thread. */ struct recovery_info { enum rw_state state; uint32_t epoch; uint32_t tgt_epoch; uint64_t done; /* * true when automatic recovery is disabled * and no recovery work is running */ bool suspended; bool notify_complete; uint64_t count; uint64_t *oids; uint64_t *prio_oids; uint64_t nr_prio_oids; uint64_t nr_scheduled_prio_oids; struct vnode_info *old_vinfo; struct vnode_info *cur_vinfo; }; static struct recovery_info *next_rinfo; static main_thread(struct recovery_info *) current_rinfo; static void queue_recovery_work(struct recovery_info *rinfo); /* Dynamically grown list buffer default as 4M (2T storage) */ #define DEFAULT_LIST_BUFFER_SIZE (UINT64_C(1) << 22) static size_t list_buffer_size = DEFAULT_LIST_BUFFER_SIZE; static int obj_cmp(const uint64_t *oid1, const uint64_t *oid2) { const uint64_t hval1 = fnv_64a_buf(oid1, sizeof(*oid1), FNV1A_64_INIT); const uint64_t hval2 = fnv_64a_buf(oid2, sizeof(*oid2), FNV1A_64_INIT); return intcmp(hval1, hval2); } static inline bool node_is_gateway_only(void) { return sys->this_node.nr_vnodes == 0; } /* recover object from vnode */ static int recover_object_from(struct recovery_obj_work *row, const struct sd_node *node, uint32_t tgt_epoch) { uint64_t oid = row->oid; uint32_t local_epoch = row->local_epoch; uint8_t *sha1 = row->local_sha1; uint32_t epoch = 
row->base.epoch; int ret; unsigned rlen; void *buf = NULL; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; struct siocb iocb = { 0 }; if (node_is_local(node)) { if (tgt_epoch < sys_epoch()) return sd_store->link(oid, tgt_epoch); return SD_RES_NO_OBJ; } /* compare sha1 hash value first */ if (local_epoch > 0) { sd_init_req(&hdr, SD_OP_GET_HASH); hdr.obj.oid = oid; hdr.obj.tgt_epoch = tgt_epoch; ret = sheep_exec_req(&node->nid, &hdr, NULL); if (ret != SD_RES_SUCCESS) return ret; /* fix: compare the full SHA1_DIGEST_SIZE-byte digest; sizeof(SHA1_DIGEST_SIZE) is sizeof(int) == 4, which compared only the first 4 digest bytes */ if (memcmp(rsp->hash.digest, sha1, SHA1_DIGEST_SIZE) == 0) { sd_debug("use local replica at epoch %d", local_epoch); ret = sd_store->link(oid, local_epoch); if (ret == SD_RES_SUCCESS) return ret; } } rlen = get_objsize(oid); buf = xvalloc(rlen); /* recover from remote replica */ sd_init_req(&hdr, SD_OP_READ_PEER); hdr.epoch = epoch; hdr.flags = SD_FLAG_CMD_RECOVERY; hdr.data_length = rlen; hdr.obj.oid = oid; hdr.obj.tgt_epoch = tgt_epoch; ret = sheep_exec_req(&node->nid, &hdr, buf); if (ret == SD_RES_SUCCESS) { iocb.epoch = epoch; iocb.length = rsp->data_length; iocb.offset = rsp->obj.offset; iocb.buf = buf; ret = sd_store->create_and_write(oid, &iocb); } free(buf); return ret; } /* * A node that does not match any node in current node list means the node has * left the cluster, then it's an invalid node. 
*/ static bool invalid_node(const struct sd_node *n, struct vnode_info *info) { if (xbsearch(n, info->nodes, info->nr_nodes, node_cmp)) return false; return true; } static int recover_object_from_replica(struct recovery_obj_work *row, struct vnode_info *old, uint32_t tgt_epoch) { uint64_t oid = row->oid; uint32_t epoch = row->base.epoch; int nr_copies, ret = SD_RES_SUCCESS, start = 0; bool fully_replicated = true; nr_copies = get_obj_copy_number(oid, old->nr_zones); /* find local node first to try to recover from local */ for (int i = 0; i < nr_copies; i++) { const struct sd_vnode *vnode; vnode = oid_to_vnode(old->vnodes, old->nr_vnodes, oid, i); if (vnode_is_local(vnode)) { start = i; break; } } /* Let's do a breadth-first search */ for (int i = 0; i < nr_copies; i++) { const struct sd_node *node; int idx = (i + start) % nr_copies; node = oid_to_node(old->vnodes, old->nr_vnodes, oid, idx, old->nodes); if (invalid_node(node, row->base.cur_vinfo)) continue; ret = recover_object_from(row, node, tgt_epoch); switch (ret) { case SD_RES_SUCCESS: sd_debug("recovered oid %"PRIx64" from %d to epoch %d", oid, tgt_epoch, epoch); objlist_cache_insert(oid); return ret; case SD_RES_OLD_NODE_VER: /* move to the next epoch recovery */ return ret; case SD_RES_NO_OBJ: fully_replicated = false; /* fall through */ default: break; } } /* * sheep would return a stale object when * - all the nodes hold the copies, and * - all the nodes are gone * at the some epoch */ if (fully_replicated && ret != SD_RES_SUCCESS) ret = SD_RES_STALE_OBJ; return ret; } /* * Recover the object from its track in epoch history. That is, * the routine will try to recovery it from the nodes it has stayed, * at least, *theoretically* on consistent hash ring. 
*/ static int do_recover_object(struct recovery_obj_work *row) { struct recovery_work *rw = &row->base; struct vnode_info *old; uint64_t oid = row->oid; uint32_t tgt_epoch = rw->tgt_epoch; int ret; struct vnode_info *new_old; old = grab_vnode_info(rw->old_vinfo); again: sd_debug("try recover object %"PRIx64" from epoch %"PRIu32, oid, tgt_epoch); ret = recover_object_from_replica(row, old, tgt_epoch); switch (ret) { case SD_RES_SUCCESS: /* Succeed */ break; case SD_RES_OLD_NODE_VER: row->stop = true; break; case SD_RES_STALE_OBJ: sd_alert("cannot access any replicas of %"PRIx64" at epoch %d", oid, tgt_epoch); sd_alert("clients may see old data"); /* fall through */ default: /* No luck, roll back to an older configuration and try again */ rollback: tgt_epoch--; if (tgt_epoch < 1) { sd_err("can not recover oid %"PRIx64, oid); ret = -1; break; } new_old = get_vnode_info_epoch(tgt_epoch, rw->cur_vinfo); if (!new_old) { /* We rollback in case we don't get a valid epoch */ sd_alert("cannot get epoch %d", tgt_epoch); sd_alert("clients may see old data"); goto rollback; } put_vnode_info(old); old = new_old; goto again; } put_vnode_info(old); return ret; } static void recover_object_work(struct work *work) { struct recovery_work *rw = container_of(work, struct recovery_work, work); struct recovery_obj_work *row = container_of(rw, struct recovery_obj_work, base); uint64_t oid = row->oid; int ret, epoch; if (sd_store->exist(oid)) { sd_debug("the object is already recovered"); return; } /* find object in the stale directory */ for (epoch = sys_epoch() - 1; epoch > 0; epoch--) { ret = sd_store->get_hash(oid, epoch, row->local_sha1); if (ret == SD_RES_SUCCESS) { sd_debug("replica found in local at epoch %d", epoch); row->local_epoch = epoch; break; } } ret = do_recover_object(row); if (ret < 0) sd_err("failed to recover object %"PRIx64, oid); } bool node_in_recovery(void) { return main_thread_get(current_rinfo) != NULL; } static inline void prepare_schedule_oid(uint64_t oid) { 
struct recovery_info *rinfo = main_thread_get(current_rinfo); uint64_t i; for (i = 0; i < rinfo->nr_prio_oids; i++) if (rinfo->prio_oids[i] == oid) return; /* * We need this check because oid might not be recovered. * Very much unlikely though, but it might happen indeed. */ for (i = 0; i < rinfo->done; i++) if (rinfo->oids[i] == oid) { sd_debug("%"PRIx64" not recovered, don't schedule it", oid); return; } /* When recovery is not suspended, oid is currently being recovered */ if (!rinfo->suspended && rinfo->oids[rinfo->done] == oid) return; rinfo->nr_prio_oids++; rinfo->prio_oids = xrealloc(rinfo->prio_oids, rinfo->nr_prio_oids * sizeof(uint64_t)); rinfo->prio_oids[rinfo->nr_prio_oids - 1] = oid; sd_debug("%"PRIx64" nr_prio_oids %"PRIu64, oid, rinfo->nr_prio_oids); resume_suspended_recovery(); } bool oid_in_recovery(uint64_t oid) { struct recovery_info *rinfo = main_thread_get(current_rinfo); uint64_t i; if (!node_in_recovery()) return false; if (sd_store->exist(oid)) { sd_debug("the object %" PRIx64 " is already recoverd", oid); return false; } if (uatomic_read(&next_rinfo)) return true; /* If we are in preparation of object list, oid is not recovered yet */ if (rinfo->state == RW_PREPARE_LIST) return true; /* * Check if oid is in the list that to be recovered later * * FIXME: do we need more efficient yet complex data structure? 
*/ for (i = rinfo->done; i < rinfo->count; i++) if (rinfo->oids[i] == oid) break; /* * Newly created object after prepare_object_list() might not be * in the list */ if (i == rinfo->count) { sd_err("%"PRIx64" is not in the recovery list", oid); return false; } prepare_schedule_oid(oid); return true; } static void free_recovery_work(struct recovery_work *rw) { put_vnode_info(rw->cur_vinfo); put_vnode_info(rw->old_vinfo); free(rw); } static void free_recovery_list_work(struct recovery_list_work *rlw) { put_vnode_info(rlw->base.cur_vinfo); put_vnode_info(rlw->base.old_vinfo); free(rlw->oids); free(rlw); } static void free_recovery_obj_work(struct recovery_obj_work *row) { put_vnode_info(row->base.cur_vinfo); put_vnode_info(row->base.old_vinfo); free(row); } static void free_recovery_info(struct recovery_info *rinfo) { put_vnode_info(rinfo->cur_vinfo); put_vnode_info(rinfo->old_vinfo); free(rinfo->oids); free(rinfo->prio_oids); free(rinfo); } /* Return true if next recovery work is queued. */ static inline bool run_next_rw(void) { struct recovery_info *nrinfo = uatomic_xchg_ptr(&next_rinfo, NULL); struct recovery_info *cur = main_thread_get(current_rinfo); if (nrinfo == NULL) return false; /* * When md recovery supersed the reweight or node recovery, we need to * notify completion. 
*/ if (!nrinfo->notify_complete && cur->notify_complete) nrinfo->notify_complete = true; free_recovery_info(cur); if (!node_is_gateway_only()) sd_store->update_epoch(nrinfo->tgt_epoch); main_thread_set(current_rinfo, nrinfo); wakeup_all_requests(); queue_recovery_work(nrinfo); sd_debug("recovery work is superseded"); return true; } static void notify_recovery_completion_work(struct work *work) { struct recovery_work *rw = container_of(work, struct recovery_work, work); struct sd_req hdr; int ret; sd_init_req(&hdr, SD_OP_COMPLETE_RECOVERY); hdr.obj.tgt_epoch = rw->epoch; hdr.flags = SD_FLAG_CMD_WRITE; hdr.data_length = sizeof(sys->this_node); ret = exec_local_req(&hdr, &sys->this_node); if (ret != SD_RES_SUCCESS) sd_err("failed to notify recovery completion, %d", rw->epoch); } static void notify_recovery_completion_main(struct work *work) { struct recovery_work *rw = container_of(work, struct recovery_work, work); free_recovery_work(rw); } static inline void finish_recovery(struct recovery_info *rinfo) { uint32_t recovered_epoch = rinfo->epoch; main_thread_set(current_rinfo, NULL); wakeup_all_requests(); if (rinfo->notify_complete) { rinfo->state = RW_NOTIFY_COMPLETION; queue_recovery_work(rinfo); } free_recovery_info(rinfo); sd_debug("recovery complete: new epoch %"PRIu32, recovered_epoch); } static inline bool oid_in_prio_oids(struct recovery_info *rinfo, uint64_t oid) { for (uint64_t i = 0; i < rinfo->nr_prio_oids; i++) if (rinfo->prio_oids[i] == oid) return true; return false; } /* * Schedule prio_oids to be recovered first in FIFO order * * rw->done is index of the original next object to be recovered and also the * number of objects already recovered. 
* we just move rw->prio_oids in between: * new_oids = [0..rw->done - 1] + [rw->prio_oids] + [rw->done] */ static inline void finish_schedule_oids(struct recovery_info *rinfo) { uint64_t i, nr_recovered = rinfo->done, new_idx; uint64_t *new_oids; /* If I am the last oid, done */ if (nr_recovered == rinfo->count - 1) goto done; new_oids = xmalloc(list_buffer_size); memcpy(new_oids, rinfo->oids, nr_recovered * sizeof(uint64_t)); memcpy(new_oids + nr_recovered, rinfo->prio_oids, rinfo->nr_prio_oids * sizeof(uint64_t)); new_idx = nr_recovered + rinfo->nr_prio_oids; for (i = rinfo->done; i < rinfo->count; i++) { if (oid_in_prio_oids(rinfo, rinfo->oids[i])) continue; new_oids[new_idx++] = rinfo->oids[i]; } /* rw->count should eq new_idx, otherwise something is wrong */ sd_debug("%snr_recovered %" PRIu64 ", nr_prio_oids %" PRIu64 ", count %" PRIu64 " = new %" PRIu64, rinfo->count == new_idx ? "" : "WARN: ", nr_recovered, rinfo->nr_prio_oids, rinfo->count, new_idx); free(rinfo->oids); rinfo->oids = new_oids; done: free(rinfo->prio_oids); rinfo->prio_oids = NULL; rinfo->nr_scheduled_prio_oids += rinfo->nr_prio_oids; rinfo->nr_prio_oids = 0; } /* * When automatic object recovery is disabled, the behavior of the * recovery process is like 'lazy recovery'. This function returns * true if the recovery queue contains objects being accessed by * clients. Sheep recovers such objects for availability even when * automatic object recovery is not enabled. 
*/ static bool has_scheduled_objects(struct recovery_info *rinfo) { return rinfo->done < rinfo->nr_scheduled_prio_oids; } static void recover_next_object(struct recovery_info *rinfo) { if (run_next_rw()) return; if (rinfo->nr_prio_oids) finish_schedule_oids(rinfo); if (sys->cinfo.disable_recovery && !has_scheduled_objects(rinfo)) { sd_debug("suspended"); rinfo->suspended = true; /* suspend until resume_suspended_recovery() is called */ return; } /* Try recover next object */ queue_recovery_work(rinfo); } void resume_suspended_recovery(void) { struct recovery_info *rinfo = main_thread_get(current_rinfo); if (rinfo && rinfo->suspended) { rinfo->suspended = false; recover_next_object(rinfo); } } static void recover_object_main(struct work *work) { struct recovery_work *rw = container_of(work, struct recovery_work, work); struct recovery_obj_work *row = container_of(rw, struct recovery_obj_work, base); struct recovery_info *rinfo = main_thread_get(current_rinfo); if (run_next_rw()) goto out; if (row->stop) { /* * Stop this recovery process and wait for epoch to be * lifted and flush wait queue to requeue those * requests */ rinfo->notify_complete = false; finish_recovery(rinfo); sd_debug("recovery is stopped"); goto out; } wakeup_requests_on_oid(row->oid); rinfo->done++; sd_info("object %"PRIx64" is recovered (%"PRIu64"/%"PRIu64")", row->oid, rinfo->done, rinfo->count); if (rinfo->done < rinfo->count) { recover_next_object(rinfo); goto out; } finish_recovery(rinfo); out: free_recovery_obj_work(row); } static void finish_object_list(struct work *work) { struct recovery_work *rw = container_of(work, struct recovery_work, work); struct recovery_list_work *rlw = container_of(rw, struct recovery_list_work, base); struct recovery_info *rinfo = main_thread_get(current_rinfo); rinfo->state = RW_RECOVER_OBJ; rinfo->count = rlw->count; rinfo->oids = rlw->oids; rlw->oids = NULL; free_recovery_list_work(rlw); if (run_next_rw()) return; if (!rinfo->count) { finish_recovery(rinfo); 
return; } recover_next_object(rinfo); return; } /* Fetch the object list from all the nodes in the cluster */ static uint64_t *fetch_object_list(struct sd_node *e, uint32_t epoch, size_t *nr_oids) { struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; size_t buf_size = list_buffer_size; uint64_t *buf = xmalloc(buf_size); int ret; sd_debug("%s", addr_to_str(e->nid.addr, e->nid.port)); retry: sd_init_req(&hdr, SD_OP_GET_OBJ_LIST); hdr.data_length = buf_size; hdr.epoch = epoch; ret = sheep_exec_req(&e->nid, &hdr, buf); switch (ret) { case SD_RES_SUCCESS: break; case SD_RES_BUFFER_SMALL: buf_size *= 2; buf = xrealloc(buf, buf_size); goto retry; default: sd_alert("cannot get object list from %s", addr_to_str(e->nid.addr, e->nid.port)); sd_alert("some objects may be not recovered at epoch %d", epoch); free(buf); return NULL; } *nr_oids = rsp->data_length / sizeof(uint64_t); sd_debug("%zu", *nr_oids); return buf; } /* Screen out objects that don't belong to this node */ static void screen_object_list(struct recovery_list_work *rlw, uint64_t *oids, size_t nr_oids) { struct recovery_work *rw = &rlw->base; const struct sd_vnode *vnodes[SD_MAX_COPIES]; uint64_t old_count = rlw->count; uint64_t nr_objs; uint64_t i, j; for (i = 0; i < nr_oids; i++) { if (xbsearch(&oids[i], rlw->oids, old_count, obj_cmp)) /* the object is already scheduled to be recovered */ continue; nr_objs = get_obj_copy_number(oids[i], rw->cur_vinfo->nr_zones); if (!nr_objs) { sd_err("ERROR: can not find copy number for object %" PRIx64, oids[i]); continue; } oid_to_vnodes(rw->cur_vinfo->vnodes, rw->cur_vinfo->nr_vnodes, oids[i], nr_objs, vnodes); for (j = 0; j < nr_objs; j++) { if (!vnode_is_local(vnodes[j])) continue; rlw->oids[rlw->count++] = oids[i]; /* enlarge the list buffer if full */ if (rlw->count == list_buffer_size / sizeof(uint64_t)) { list_buffer_size *= 2; rlw->oids = xrealloc(rlw->oids, list_buffer_size); } break; } } xqsort(rlw->oids, rlw->count, obj_cmp); } /* Prepare the object 
list that belongs to this node */ static void prepare_object_list(struct work *work) { struct recovery_work *rw = container_of(work, struct recovery_work, work); struct recovery_list_work *rlw = container_of(rw, struct recovery_list_work, base); struct sd_node *cur = rw->cur_vinfo->nodes; int cur_nr = rw->cur_vinfo->nr_nodes; int start = random() % cur_nr, i, end = cur_nr; uint64_t *oids; if (node_is_gateway_only()) return; sd_debug("%u", rw->epoch); wait_get_vdis_done(); again: /* We need to start at random node for better load balance */ for (i = start; i < end; i++) { size_t nr_oids; struct sd_node *node = cur + i; if (uatomic_read(&next_rinfo)) { sd_debug("go to the next recovery"); return; } oids = fetch_object_list(node, rw->epoch, &nr_oids); if (!oids) continue; screen_object_list(rlw, oids, nr_oids); free(oids); } if (start != 0) { end = start; start = 0; goto again; } sd_debug("%"PRIu64, rlw->count); } int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo, bool epoch_lifted) { struct recovery_info *rinfo; rinfo = xzalloc(sizeof(struct recovery_info)); rinfo->state = RW_PREPARE_LIST; rinfo->epoch = sys->cinfo.epoch; rinfo->tgt_epoch = epoch_lifted ? sys->cinfo.epoch - 1 : sys->cinfo.epoch; rinfo->count = 0; if (epoch_lifted) rinfo->notify_complete = true; /* Reweight or node recovery */ else rinfo->notify_complete = false; /* MD recovery */ rinfo->cur_vinfo = grab_vnode_info(cur_vinfo); rinfo->old_vinfo = grab_vnode_info(old_vinfo); if (!node_is_gateway_only()) sd_store->update_epoch(rinfo->tgt_epoch); if (main_thread_get(current_rinfo) != NULL) { /* skip the previous epoch recovery */ struct recovery_info *nrinfo; nrinfo = uatomic_xchg_ptr(&next_rinfo, rinfo); if (nrinfo) free_recovery_info(nrinfo); sd_debug("recovery skipped"); /* * This is necesary to invoke run_next_rw when * recovery work is suspended. 
*/ resume_suspended_recovery(); } else { main_thread_set(current_rinfo, rinfo); queue_recovery_work(rinfo); } wakeup_requests_on_epoch(); return 0; } static void queue_recovery_work(struct recovery_info *rinfo) { struct recovery_work *rw; struct recovery_list_work *rlw; struct recovery_obj_work *row; switch (rinfo->state) { case RW_PREPARE_LIST: rlw = xzalloc(sizeof(*rlw)); rlw->oids = xmalloc(list_buffer_size); rw = &rlw->base; rw->work.fn = prepare_object_list; rw->work.done = finish_object_list; break; case RW_RECOVER_OBJ: row = xzalloc(sizeof(*row)); row->oid = rinfo->oids[rinfo->done]; rw = &row->base; rw->work.fn = recover_object_work; rw->work.done = recover_object_main; break; case RW_NOTIFY_COMPLETION: rw = xzalloc(sizeof(*rw)); rw->work.fn = notify_recovery_completion_work; rw->work.done = notify_recovery_completion_main; break; default: panic("unknow recovery state %d", rinfo->state); break; } rw->epoch = rinfo->epoch; rw->tgt_epoch = rinfo->tgt_epoch; rw->cur_vinfo = grab_vnode_info(rinfo->cur_vinfo); rw->old_vinfo = grab_vnode_info(rinfo->old_vinfo); queue_work(sys->recovery_wqueue, &rw->work); } void get_recovery_state(struct recovery_state *state) { struct recovery_info *rinfo = main_thread_get(current_rinfo); memset(state, 0, sizeof(*state)); if (!rinfo) { state->in_recovery = 0; return; } state->in_recovery = 1; state->state = rinfo->state; state->nr_finished = rinfo->done; state->nr_total = rinfo->count; } sheepdog-0.7.5/sheep/request.c000066400000000000000000000515531223630776600163060ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . 
*/ #include #include "sheep_priv.h" static void requeue_request(struct request *req); static void del_requeue_request(struct request *req) { list_del(&req->request_list); requeue_request(req); } static bool is_access_local(struct request *req, uint64_t oid) { const struct sd_vnode *obj_vnodes[SD_MAX_COPIES]; int nr_copies; int i; nr_copies = get_req_copy_number(req); oid_to_vnodes(req->vinfo->vnodes, req->vinfo->nr_vnodes, oid, nr_copies, obj_vnodes); for (i = 0; i < nr_copies; i++) { if (vnode_is_local(obj_vnodes[i])) return true; } return false; } static void io_op_done(struct work *work) { struct request *req = container_of(work, struct request, work); switch (req->rp.result) { case SD_RES_EIO: req->rp.result = SD_RES_NETWORK_ERROR; sd_err("leaving sheepdog cluster"); leave_cluster(); break; case SD_RES_SUCCESS: case SD_RES_NETWORK_ERROR: break; default: sd_debug("unhandled error %s", sd_strerror(req->rp.result)); break; } put_request(req); return; } /* * There are 4 cases that a request needs to sleep on wait queues for requeue: * * 1. Epoch of request sender is older than system epoch of receiver * In this case, we response the sender with SD_RES_OLD_NODE_VER to * sender so sender would put the request into its own wait queue to * wait its system epoch get lifted and resend the request. * * 2. Epoch of request sender is newer than system epoch of receiver * In this case, we put the request into wait queue of receiver, to wait * system epoch of receiver to get lifted, then retry this request on * its own. * * 3. Object requested doesn't exist and recovery work is at RW_INIT state * In this case, we check whether the requested object exists, if so, * go process the request directly, if not put the request into wait * queue of the receiver to wait for the finish of this oid recovery. * * 4. 
Object requested doesn't exist and is being recoverred * In this case, we put the request into wait queue of receiver and when * we recover an object we try to wake up the request on this oid. */ static inline void sleep_on_wait_queue(struct request *req) { list_add_tail(&req->request_list, &sys->req_wait_queue); } static void gateway_op_done(struct work *work) { struct request *req = container_of(work, struct request, work); struct sd_req *hdr = &req->rq; switch (req->rp.result) { case SD_RES_OLD_NODE_VER: if (req->rp.epoch > sys->cinfo.epoch) { /* * Gateway of this node is expected to process this * request later when epoch is lifted. */ sleep_on_wait_queue(req); return; } /*FALLTHRU*/ case SD_RES_NEW_NODE_VER: case SD_RES_NETWORK_ERROR: case SD_RES_WAIT_FOR_JOIN: case SD_RES_WAIT_FOR_FORMAT: case SD_RES_KILLED: sd_debug("retrying failed I/O request op %s result %x epoch %" PRIu32 ", sys epoch %" PRIu32, op_name(req->op), req->rp.result, req->rq.epoch, sys->cinfo.epoch); goto retry; case SD_RES_EIO: if (is_access_local(req, hdr->obj.oid)) { sd_err("leaving sheepdog cluster"); leave_cluster(); goto retry; } break; case SD_RES_SUCCESS: break; default: sd_debug("unhandled error %s", sd_strerror(req->rp.result)); break; } put_request(req); return; retry: requeue_request(req); } static void local_op_done(struct work *work) { struct request *req = container_of(work, struct request, work); if (has_process_main(req->op)) { req->rp.result = do_process_main(req->op, &req->rq, &req->rp, req->data); } put_request(req); } static int check_request_epoch(struct request *req) { if (before(req->rq.epoch, sys->cinfo.epoch)) { sd_err("old node version %u, %u (%s)", sys->cinfo.epoch, req->rq.epoch, op_name(req->op)); /* Ask for sleeping req on requester's wait queue */ req->rp.result = SD_RES_OLD_NODE_VER; req->rp.epoch = sys->cinfo.epoch; put_request(req); return -1; } else if (after(req->rq.epoch, sys->cinfo.epoch)) { sd_err("new node version %u, %u (%s)", sys->cinfo.epoch, 
req->rq.epoch, op_name(req->op)); /* Wait for local epoch to be lifted */ req->rp.result = SD_RES_NEW_NODE_VER; sleep_on_wait_queue(req); return -1; } return 0; } static bool request_in_recovery(struct request *req) { /* * For CREATE request, we simply service it. CREATE operations are * atomic, so it cannot happen for recover process to overwrite the * created objects with the older data. */ if (req->rq.opcode == SD_OP_CREATE_AND_WRITE_PEER || req->rq.opcode == SD_OP_CREATE_AND_WRITE_OBJ) return false; /* * Request from recovery should go down the Farm even if * oid_in_recovery() returns true because we should also try snap * cache of the Farm and return the error code back if not found. */ if (oid_in_recovery(req->local_oid) && !(req->rq.flags & SD_FLAG_CMD_RECOVERY)) { sd_debug("%"PRIx64" wait on oid", req->local_oid); sleep_on_wait_queue(req); return true; } return false; } /* Wakeup requests because of epoch mismatch */ void wakeup_requests_on_epoch(void) { struct request *req, *t; LIST_HEAD(pending_list); list_splice_init(&sys->req_wait_queue, &pending_list); list_for_each_entry_safe(req, t, &pending_list, request_list) { switch (req->rp.result) { case SD_RES_OLD_NODE_VER: /* * Gateway retries to send the request when * its epoch changes. */ assert(is_gateway_op(req->op)); sd_debug("gateway %"PRIx64, req->rq.obj.oid); req->rq.epoch = sys->cinfo.epoch; del_requeue_request(req); break; case SD_RES_NEW_NODE_VER: /* * Peer retries the request locally when its epoch * changes. 
*/ assert(!is_gateway_op(req->op)); sd_debug("peer %"PRIx64, req->rq.obj.oid); del_requeue_request(req); break; default: break; } } list_splice_init(&pending_list, &sys->req_wait_queue); } /* Wakeup the requests on the oid that was previously being recoverred */ void wakeup_requests_on_oid(uint64_t oid) { struct request *req, *t; LIST_HEAD(pending_list); list_splice_init(&sys->req_wait_queue, &pending_list); list_for_each_entry_safe(req, t, &pending_list, request_list) { if (req->local_oid != oid) continue; sd_debug("retry %" PRIx64, req->local_oid); del_requeue_request(req); } list_splice_init(&pending_list, &sys->req_wait_queue); } void wakeup_all_requests(void) { struct request *req, *n; LIST_HEAD(pending_list); list_splice_init(&sys->req_wait_queue, &pending_list); list_for_each_entry_safe(req, n, &pending_list, request_list) { sd_debug("%"PRIx64, req->rq.obj.oid); del_requeue_request(req); } } static void queue_peer_request(struct request *req) { req->local_oid = req->rq.obj.oid; if (req->local_oid) { if (check_request_epoch(req) < 0) return; if (request_in_recovery(req)) return; } if (req->rq.flags & SD_FLAG_CMD_RECOVERY) req->rq.epoch = req->rq.obj.tgt_epoch; req->work.fn = do_process_work; req->work.done = io_op_done; queue_work(sys->io_wqueue, &req->work); } static void queue_gateway_request(struct request *req) { struct sd_req *hdr = &req->rq; if (is_access_local(req, hdr->obj.oid)) req->local_oid = hdr->obj.oid; /* * If we go for cache object, we don't care if it is being recovered * Even if it doesn't exist in cache, we'll rely on cache layer to pull * it. * * Not ture for local request because it might go for backend store * such as pushing cache object, in this case we should check if request * is in recovery. 
*/ if (sys->enable_object_cache && !req->local) goto queue_work; if (req->local_oid) if (request_in_recovery(req)) return; queue_work: req->work.fn = do_process_work; req->work.done = gateway_op_done; queue_work(sys->gateway_wqueue, &req->work); } static void queue_local_request(struct request *req) { req->work.fn = do_process_work; req->work.done = local_op_done; queue_work(sys->io_wqueue, &req->work); } static void queue_request(struct request *req) { struct sd_req *hdr = &req->rq; struct sd_rsp *rsp = &req->rp; /* * Check the protocol version for all internal commands, and public * commands that have it set. We can't enforce it on all public * ones as it isn't a mandatory part of the public protocol. */ if (hdr->opcode >= 0x80) { if (hdr->proto_ver != SD_SHEEP_PROTO_VER) { rsp->result = SD_RES_VER_MISMATCH; goto done; } } else if (hdr->proto_ver) { if (hdr->proto_ver > SD_PROTO_VER) { rsp->result = SD_RES_VER_MISMATCH; goto done; } } req->op = get_sd_op(hdr->opcode); if (!req->op) { sd_err("invalid opcode %d", hdr->opcode); rsp->result = SD_RES_INVALID_PARMS; goto done; } sd_debug("%s, %d", op_name(req->op), sys->cinfo.status); switch (sys->cinfo.status) { case SD_STATUS_KILLED: rsp->result = SD_RES_KILLED; goto done; case SD_STATUS_SHUTDOWN: rsp->result = SD_RES_SHUTDOWN; goto done; case SD_STATUS_WAIT: if (!is_force_op(req->op)) { if (sys->cinfo.ctime == 0) rsp->result = SD_RES_WAIT_FOR_FORMAT; else rsp->result = SD_RES_WAIT_FOR_JOIN; goto done; } break; default: break; } req->vinfo = get_vnode_info(); if (is_peer_op(req->op)) { queue_peer_request(req); } else if (is_gateway_op(req->op)) { hdr->epoch = sys->cinfo.epoch; queue_gateway_request(req); } else if (is_local_op(req->op)) { hdr->epoch = sys->cinfo.epoch; queue_local_request(req); } else if (is_cluster_op(req->op)) { hdr->epoch = sys->cinfo.epoch; queue_cluster_request(req); } else { sd_err("unknown operation %d", hdr->opcode); rsp->result = SD_RES_SYSTEM_ERROR; goto done; } return; done: 
put_request(req); } static void requeue_request(struct request *req) { if (req->vinfo) { put_vnode_info(req->vinfo); req->vinfo = NULL; } queue_request(req); } static void clear_client_info(struct client_info *ci); static struct request *alloc_local_request(void *data, int data_length) { struct request *req; req = xzalloc(sizeof(struct request)); if (data_length) { req->data_length = data_length; req->data = data; } req->local = true; INIT_LIST_HEAD(&req->request_list); refcount_set(&req->refcnt, 1); return req; } static void free_local_request(struct request *req) { put_vnode_info(req->vinfo); free(req); } /* * Exec the request locally and synchronously. * * This function takes advantage of gateway's retry mechanism and can be only * called from worker thread. */ worker_fn int exec_local_req(struct sd_req *rq, void *data) { struct request *req; int ret; req = alloc_local_request(data, rq->data_length); req->rq = *rq; req->local_req_efd = eventfd(0, 0); if (req->local_req_efd < 0) { /* Fake the result to ask for retry */ req->rp.result = SD_RES_NETWORK_ERROR; goto out; } pthread_mutex_lock(&sys->local_req_lock); list_add_tail(&req->request_list, &sys->local_req_queue); pthread_mutex_unlock(&sys->local_req_lock); eventfd_xwrite(sys->local_req_efd, 1); eventfd_xread(req->local_req_efd); out: /* fill rq with response header as exec_req does */ memcpy(rq, &req->rp, sizeof(req->rp)); close(req->local_req_efd); ret = req->rp.result; free_local_request(req); return ret; } static struct request *alloc_request(struct client_info *ci, int data_length) { struct request *req; req = zalloc(sizeof(struct request)); if (!req) return NULL; req->ci = ci; refcount_inc(&ci->refcnt); if (data_length) { req->data_length = data_length; req->data = valloc(data_length); if (!req->data) { free(req); return NULL; } } INIT_LIST_HEAD(&req->request_list); refcount_set(&req->refcnt, 1); uatomic_inc(&sys->nr_outstanding_reqs); return req; } static void free_request(struct request *req) { 
uatomic_dec(&sys->nr_outstanding_reqs); refcount_dec(&req->ci->refcnt); put_vnode_info(req->vinfo); free(req->data); free(req); } main_fn void put_request(struct request *req) { struct client_info *ci = req->ci; if (refcount_dec(&req->refcnt) > 0) return; if (req->local) eventfd_xwrite(req->local_req_efd, 1); else { if (conn_tx_on(&ci->conn)) { clear_client_info(ci); free_request(req); } else { list_add(&req->request_list, &ci->done_reqs); } } } static void init_rx_hdr(struct client_info *ci) { ci->conn.c_rx_state = C_IO_HEADER; ci->rx_req = NULL; ci->conn.rx_length = sizeof(struct sd_req); ci->conn.rx_buf = &ci->conn.rx_hdr; } static inline int begin_rx(struct client_info *ci) { int ret; uint64_t data_len; struct connection *conn = &ci->conn; struct sd_req *hdr = &conn->rx_hdr; struct request *req; switch (conn->c_rx_state) { case C_IO_HEADER: ret = rx(conn, C_IO_DATA_INIT); if (!ret || conn->c_rx_state != C_IO_DATA_INIT) break; case C_IO_DATA_INIT: data_len = hdr->data_length; req = alloc_request(ci, data_len); if (!req) { conn->c_rx_state = C_IO_CLOSED; break; } ci->rx_req = req; /* use le_to_cpu */ memcpy(&req->rq, hdr, sizeof(req->rq)); if (data_len && hdr->flags & SD_FLAG_CMD_WRITE) { conn->c_rx_state = C_IO_DATA; conn->rx_length = data_len; conn->rx_buf = req->data; } else { conn->c_rx_state = C_IO_END; break; } case C_IO_DATA: ret = rx(conn, C_IO_END); break; default: sd_err("bug: unknown state %d", conn->c_rx_state); } if (is_conn_dead(conn)) { clear_client_info(ci); return -1; } /* Short read happens */ if (conn->c_rx_state != C_IO_END) return -1; return 0; } static inline void finish_rx(struct client_info *ci) { struct request *req; req = ci->rx_req; init_rx_hdr(ci); sd_debug("%d, %s:%d", ci->conn.fd, ci->conn.ipstr, ci->conn.port); queue_request(req); } static void do_client_rx(struct client_info *ci) { if (begin_rx(ci) < 0) return; finish_rx(ci); } static void init_tx_hdr(struct client_info *ci) { struct sd_rsp *rsp = (struct sd_rsp *)&ci->conn.tx_hdr; 
struct request *req; assert(!list_empty(&ci->done_reqs)); memset(rsp, 0, sizeof(*rsp)); req = list_first_entry(&ci->done_reqs, struct request, request_list); list_del(&req->request_list); ci->tx_req = req; ci->conn.tx_length = sizeof(*rsp); ci->conn.c_tx_state = C_IO_HEADER; ci->conn.tx_buf = rsp; /* use cpu_to_le */ memcpy(rsp, &req->rp, sizeof(*rsp)); rsp->epoch = sys->cinfo.epoch; rsp->opcode = req->rq.opcode; rsp->id = req->rq.id; } static inline int begin_tx(struct client_info *ci) { int ret, opt; struct sd_rsp *rsp = (struct sd_rsp *)&ci->conn.tx_hdr; /* If short send happens, we don't need init hdr */ if (!ci->tx_req) init_tx_hdr(ci); opt = 1; setsockopt(ci->conn.fd, SOL_TCP, TCP_CORK, &opt, sizeof(opt)); switch (ci->conn.c_tx_state) { case C_IO_HEADER: ret = tx(&ci->conn, C_IO_DATA_INIT); if (!ret) break; if (rsp->data_length) { ci->conn.tx_length = rsp->data_length; ci->conn.tx_buf = ci->tx_req->data; ci->conn.c_tx_state = C_IO_DATA; } else { ci->conn.c_tx_state = C_IO_END; break; } case C_IO_DATA: ret = tx(&ci->conn, C_IO_END); if (!ret) break; default: break; } opt = 0; setsockopt(ci->conn.fd, SOL_TCP, TCP_CORK, &opt, sizeof(opt)); if (is_conn_dead(&ci->conn)) { clear_client_info(ci); return -1; } return 0; } /* Return 1 if short send happens or we have more data to send */ static inline int finish_tx(struct client_info *ci) { /* Finish sending one response */ if (ci->conn.c_tx_state == C_IO_END) { sd_debug("connection from: %d, %s:%d", ci->conn.fd, ci->conn.ipstr, ci->conn.port); free_request(ci->tx_req); ci->tx_req = NULL; } if (ci->tx_req || !list_empty(&ci->done_reqs)) return 1; return 0; } static void do_client_tx(struct client_info *ci) { if (!ci->tx_req && list_empty(&ci->done_reqs)) { if (conn_tx_off(&ci->conn)) clear_client_info(ci); return; } if (begin_tx(ci) < 0) return; if (finish_tx(ci)) return; /* Let's go sleep, and put_request() will wake me up */ if (conn_tx_off(&ci->conn)) clear_client_info(ci); } static void destroy_client(struct 
client_info *ci) { sd_debug("connection from: %s:%d", ci->conn.ipstr, ci->conn.port); close(ci->conn.fd); free(ci); } static void clear_client_info(struct client_info *ci) { struct request *req, *t; sd_debug("connection seems to be dead"); if (ci->rx_req) { free_request(ci->rx_req); ci->rx_req = NULL; } if (ci->tx_req) { free_request(ci->tx_req); ci->tx_req = NULL; } list_for_each_entry_safe(req, t, &ci->done_reqs, request_list) { list_del(&req->request_list); free_request(req); } unregister_event(ci->conn.fd); sd_debug("refcnt:%d, fd:%d, %s:%d", refcount_read(&ci->refcnt), ci->conn.fd, ci->conn.ipstr, ci->conn.port); if (refcount_read(&ci->refcnt)) return; destroy_client(ci); } static struct client_info *create_client(int fd, struct cluster_info *cluster) { struct client_info *ci; struct sockaddr_storage from; socklen_t namesize = sizeof(from); ci = zalloc(sizeof(*ci)); if (!ci) return NULL; if (getpeername(fd, (struct sockaddr *)&from, &namesize)) return NULL; switch (from.ss_family) { case AF_INET: ci->conn.port = ntohs(((struct sockaddr_in *)&from)->sin_port); inet_ntop(AF_INET, &((struct sockaddr_in *)&from)->sin_addr, ci->conn.ipstr, sizeof(ci->conn.ipstr)); break; case AF_INET6: ci->conn.port = ntohs(((struct sockaddr_in6 *)&from)->sin6_port); inet_ntop(AF_INET6, &((struct sockaddr_in6 *)&from)->sin6_addr, ci->conn.ipstr, sizeof(ci->conn.ipstr)); break; } ci->conn.fd = fd; ci->conn.events = EPOLLIN; refcount_set(&ci->refcnt, 0); INIT_LIST_HEAD(&ci->done_reqs); init_rx_hdr(ci); return ci; } static void client_handler(int fd, int events, void *data) { struct client_info *ci = (struct client_info *)data; sd_debug("%x, rx %d, tx %d", events, ci->conn.c_rx_state, ci->conn.c_tx_state); if (events & (EPOLLERR | EPOLLHUP) || is_conn_dead(&ci->conn)) return clear_client_info(ci); if (events & EPOLLIN) do_client_rx(ci); if (events & EPOLLOUT) do_client_tx(ci); } static void listen_handler(int listen_fd, int events, void *data) { struct sockaddr_storage from; socklen_t 
namesize; int fd, ret; struct client_info *ci; bool is_inet_socket = *(bool *)data; if (sys->cinfo.status == SD_STATUS_SHUTDOWN) { sd_debug("unregistering connection %d", listen_fd); unregister_event(listen_fd); return; } namesize = sizeof(from); fd = accept(listen_fd, (struct sockaddr *)&from, &namesize); if (fd < 0) { sd_err("failed to accept a new connection: %m"); return; } if (is_inet_socket) { ret = set_nodelay(fd); if (ret) { close(fd); return; } } ret = set_nonblocking(fd); if (ret) { close(fd); return; } ci = create_client(fd, data); if (!ci) { close(fd); return; } ret = register_event(fd, client_handler, ci); if (ret) { destroy_client(ci); return; } sd_debug("accepted a new connection: %d", fd); } static int create_listen_port_fn(int fd, void *data) { return register_event(fd, listen_handler, data); } int create_listen_port(char *bindaddr, int port) { static bool is_inet_socket = true; return create_listen_ports(bindaddr, port, create_listen_port_fn, &is_inet_socket); } int init_unix_domain_socket(const char *dir) { static bool is_inet_socket; char unix_path[PATH_MAX]; snprintf(unix_path, sizeof(unix_path), "%s/sock", dir); unlink(unix_path); return create_unix_domain_socket(unix_path, create_listen_port_fn, &is_inet_socket); } static void local_req_handler(int listen_fd, int events, void *data) { struct request *req, *t; LIST_HEAD(pending_list); if (events & EPOLLERR) sd_err("request handler error"); eventfd_xread(listen_fd); pthread_mutex_lock(&sys->local_req_lock); list_splice_init(&sys->local_req_queue, &pending_list); pthread_mutex_unlock(&sys->local_req_lock); list_for_each_entry_safe(req, t, &pending_list, request_list) { list_del(&req->request_list); queue_request(req); } } void local_req_init(void) { pthread_mutex_init(&sys->local_req_lock, NULL); sys->local_req_efd = eventfd(0, EFD_NONBLOCK); if (sys->local_req_efd < 0) panic("failed to init local req efd"); register_event(sys->local_req_efd, local_req_handler, NULL); } worker_fn int 
sheep_exec_req(const struct node_id *nid, struct sd_req *hdr, void *buf) { struct sd_rsp *rsp = (struct sd_rsp *)hdr; struct sockfd *sfd; int ret; sfd = sockfd_cache_get(nid); if (!sfd) return SD_RES_NETWORK_ERROR; ret = exec_req(sfd->fd, hdr, buf, sheep_need_retry, hdr->epoch, MAX_RETRY_COUNT); if (ret) { sd_debug("remote node might have gone away"); sockfd_cache_del(nid, sfd); return SD_RES_NETWORK_ERROR; } ret = rsp->result; if (ret != SD_RES_SUCCESS) sd_err("failed %s", sd_strerror(ret)); sockfd_cache_put(nid, sfd); return ret; } bool sheep_need_retry(uint32_t epoch) { return sys_epoch() == epoch; } sheepdog-0.7.5/sheep/sheep.c000066400000000000000000000526661223630776600157300ustar00rootroot00000000000000/* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include "sheep_priv.h" #include "trace/trace.h" #include "option.h" #define EPOLL_SIZE 4096 #define DEFAULT_OBJECT_DIR "/tmp" #define LOG_FILE_NAME "sheep.log" LIST_HEAD(cluster_drivers); static const char program_name[] = "sheep"; static const char bind_help[] = "Example:\n\t$ sheep -b 192.168.1.1 ...\n" "This tries to teach sheep listen to NIC of 192.168.1.1.\n" "\nExample:\n\t$ sheep -b 0.0.0.0 ...\n" "This tries to teach sheep listen to all the NICs available. 
It can be useful\n" "when you want sheep to response dog without specified address and port.\n"; static const char ioaddr_help[] = "Example:\n\t$ sheep -i host=192.168.1.1,port=7002 ...\n" "This tries to add a dedicated IO NIC of 192.168.1.1:7002 to transfer data.\n" "If IO NIC is down, sheep will fallback to non IO NIC to transfer data.\n"; static const char journal_help[] = "Available arguments:\n" "\tsize=: size of the journal in megabyes\n" "\tdir=: path to the location of the journal (default: $STORE)\n" "\tskip: if specified, skip the recovery at startup\n" "\nExample:\n\t$ sheep -j dir=/journal,size=1024\n" "This tries to use /journal as the journal storage of the size 1024M\n"; static const char loglevel_help[] = "Available log levels:\n" " # Level Description\n" " 0 SDOG_EMERG system has failed and is unusable\n" " 1 SDOG_ALERT action must be taken immediately\n" " 2 SDOG_CRIT critical conditions\n" " 3 SDOG_ERR error conditions\n" " 4 SDOG_WARNING warning conditions\n" " 5 SDOG_NOTICE normal but significant conditions\n" " 6 SDOG_INFO informational notices\n" " 7 SDOG_DEBUG debugging messages\n" "\nExample:\n\t$ sheep -l 4 ...\n" "This only allows logs with level smaller than SDOG_WARNING to be logged\n"; static const char http_help[] = "Example:\n\t$ sheep -r localhost:7001 ...\n" "This tries to enable sheep as http service backend and use localhost:7001 to\n" "communicate with http server. 
Not fully implemented yet.\n"; static const char myaddr_help[] = "Example:\n\t$ sheep -y 192.168.1.1:7000 ...\n" "This tries to tell other nodes through what address they can talk to this\n" "sheep.\n"; static const char zone_help[] = "Example:\n\t$ sheep -z 1 ...\n" "This tries to set the zone ID of this sheep to 1 and sheepdog won't store\n" "more than one copy of any object into this same zone\n"; static const char cluster_help[] = "Available arguments:\n" "\tlocal: use local driver\n" "\tcorosync: use corosync driver (default)\n" "\tzookeeper: use zookeeper driver, need extra arguments\n" "\n\tzookeeper arguments: address-list,tiemout=value (default as 3000)\n" "\nExample:\n\t" "$ sheep -c zookeeperr:IP1:PORT1,IP2:PORT2,IP3:PORT3,timeout=1000 ...\n" "This tries to use 3 node zookeeper cluster, which can be reached by\n" "IP1:PORT1, IP2:PORT2, IP3:PORT3 to manage membership and broadcast message\n" "and set the timeout of node heartbeat as 1000 milliseconds\n"; static const char cache_help[] = "Available arguments:\n" "\tsize=: size of the cache in megabyes\n" "\tdir=: path to the location of the cache (default: $STORE/cache)\n" "\tdirectio: use directio mode for cache IO, " "if not specified use buffered IO\n" "\nExample:\n\t$ sheep -w size=200000,dir=/my_ssd,directio ...\n" "This tries to use /my_ssd as the cache storage with 200G allocted to the\n" "cache in directio mode\n"; static struct sd_option sheep_options[] = { {'b', "bindaddr", true, "specify IP address of interface to listen on", bind_help}, {'c', "cluster", true, "specify the cluster driver (default: "DEFAULT_CLUSTER_DRIVER")", cluster_help}, {'d', "debug", false, "include debug messages in the log"}, {'D', "directio", false, "use direct IO for backend store"}, {'f', "foreground", false, "make the program run in the foreground"}, {'F', "log-format", true, "specify log format"}, {'g', "gateway", false, "make the progam run as a gateway mode"}, {'h', "help", false, "display this help and exit"}, 
{'i', "ioaddr", true, "use separate network card to handle IO requests", ioaddr_help}, {'j', "journal", true, "use jouranl file to log all the write " "operations", journal_help}, {'l', "loglevel", true, "specify the level of logging detail " "(default: 6 [SDOG_INFO])", loglevel_help}, {'n', "nosync", false, "drop O_SYNC for write of backend"}, {'o', "stdout", false, "log to stdout instead of shared logger"}, {'p', "port", true, "specify the TCP port on which to listen " "(default: 7000)"}, {'P', "pidfile", true, "create a pid file"}, {'r', "http", true, "enable http service", http_help}, {'u', "upgrade", false, "upgrade to the latest data layout"}, {'v', "version", false, "show the version"}, {'w', "cache", true, "enable object cache", cache_help}, {'y', "myaddr", true, "specify the address advertised to other sheep", myaddr_help}, {'z', "zone", true, "specify the zone id (default: determined by listen address)", zone_help}, { 0, NULL, false, NULL }, }; static void usage(int status) { if (status) { const char *help = option_get_help(sheep_options, optopt); if (help) { printf("%s", help); goto out; } sd_err("Try '%s --help' for more information.", program_name); } else { struct sd_option *opt; printf("Sheepdog daemon (version %s)\n" "Usage: %s [OPTION]... [PATH] (default: /tmp)\n" "Options:\n", PACKAGE_VERSION, program_name); sd_for_each_option(opt, sheep_options) { printf(" -%c, --%-18s%s\n", opt->ch, opt->name, opt->desc); } printf("\nTry '%s